# In [1]:
import os
import gym
import cv2
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import time
import json
import random
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from collections import deque

# Configuration for the learner. NOTE: later cells re-assign several of these
# names at module level; the assignment executed last is the one in effect.
ENVIRONMENT = "PongDeterministic-v4"  # Gym environment id

# Use the GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

SAVE_MODELS = False  # Save models to file so they can be tested later
MODEL_PATH = "./pong-cnn-"  # Path prefix used when saving or loading models
SAVE_MODEL_INTERVAL = 10  # Save models every X episodes
TRAIN_MODEL = True  # Train the model while playing (set to False when testing a model)

LOAD_MODEL_FROM_FILE = False  # Load model from file (was: True)
LOAD_FILE_EPISODE = 0  # Episode number of the checkpoint to load (was: 900)

BATCH_SIZE = 64  # Minibatch size sampled at random from the replay memory
MAX_EPISODE = 900  # Max number of episodes (was: 100000)
MAX_STEP = 300  # Max number of steps in one episode (was: 100000)

MAX_MEMORY_LEN = 25000  # Max replay memory length
MIN_MEMORY_LEN = 10000  # Min replay memory length before training starts

GAMMA = 0.97  # Discount rate
ALPHA = 0.00025  # Learning rate
EPSILON_DECAY = 0.99  # Factor epsilon is multiplied by on each adaptiveEpsilon() call

RENDER_GAME_WINDOW = False  # Open a new window to render the game (was: True)


class DuelCNN_Learner(nn.Module):
    """
    Dueling CNN Q-network. https://arxiv.org/abs/1511.06581

    Three conv + batch-norm layers feed two fully connected heads: an
    advantage head with one output per action and a state-value head with a
    single output. The heads are combined per sample as
    ``Q = V + (A - mean over actions of A)``.

    Args:
        h: Height of the input frames.
        w: Width of the input frames.
        output_size: Number of actions (width of the advantage head).
    """
    def __init__(self, h, w, output_size):
        super(DuelCNN_Learner, self).__init__()
        # The input has 4 channels: one per stacked frame
        self.conv1 = nn.Conv2d(in_channels=4,  out_channels=32, kernel_size=8, stride=4)
        self.bn1 = nn.BatchNorm2d(32)
        convw, convh = self.conv2d_size_calc(w, h, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
        self.bn2 = nn.BatchNorm2d(64)
        convw, convh = self.conv2d_size_calc(convw, convh, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
        self.bn3 = nn.BatchNorm2d(64)
        convw, convh = self.conv2d_size_calc(convw, convh, kernel_size=3, stride=1)

        linear_input_size = convw * convh * 64  # Flattened size of the last conv layer's output

        # Advantage (action) head
        self.Alinear1 = nn.Linear(in_features=linear_input_size, out_features=128)
        self.Alrelu = nn.LeakyReLU()  # Activation after Alinear1
        self.Alinear2 = nn.Linear(in_features=128, out_features=output_size)

        # State-value head
        self.Vlinear1 = nn.Linear(in_features=linear_input_size, out_features=128)
        self.Vlrelu = nn.LeakyReLU()  # Activation after Vlinear1
        self.Vlinear2 = nn.Linear(in_features=128, out_features=1)  # Only 1 node

    def conv2d_size_calc(self, w, h, kernel_size=5, stride=2):
        """
        Return the (width, height) produced by an unpadded conv layer.

        Args:
            w: Input width.
            h: Input height.
            kernel_size: Square kernel size of the layer.
            stride: Stride of the layer.
        """
        next_w = (w - (kernel_size - 1) - 1) // stride + 1
        next_h = (h - (kernel_size - 1) - 1) // stride + 1
        return next_w, next_h

    def forward(self, x):
        """
        Compute Q-values for a batch of stacked frames.

        Args:
            x: Float tensor of shape (batch, 4, H, W).

        Returns:
            Tensor of shape (batch, output_size) with one Q-value per action.
        """
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))

        x = x.view(x.size(0), -1)  # Flatten every sample in the batch

        Ax = self.Alrelu(self.Alinear1(x))
        Ax = self.Alinear2(Ax)  # No activation on last layer

        Vx = self.Vlrelu(self.Vlinear1(x))
        Vx = self.Vlinear2(Vx)  # No activation on last layer

        # Centre the advantages per sample (over the action axis). Averaging
        # over the whole tensor would make each sample's Q-values depend on
        # the other samples that happen to be in the same minibatch.
        q = Vx + (Ax - Ax.mean(dim=1, keepdim=True))

        return q


class Agent:
    """
    Double-DQN learner built on the dueling CNN ``DuelCNN_Learner``.

    Holds an online and a target network, a bounded replay memory and an
    epsilon-greedy exploration schedule.
    """
    def __init__(self, environment):
        """
        Set up hyperparameters, replay memory, networks and optimizer.

        Args:
            environment: Gym-style environment. Only
                ``observation_space.shape`` (read as height, width, channels)
                and ``action_space.n`` are used.
        """
        # Raw frame size reported by the environment, e.g. (210, 160, 3)
        self.state_size_h = environment.observation_space.shape[0]
        self.state_size_w = environment.observation_space.shape[1]
        self.state_size_c = environment.observation_space.shape[2]

        # Number of discrete actions. Used as output size in network
        self.action_size = environment.action_space.n

        # Image pre process params
        self.target_h = 80  # Height after process
        self.target_w = 64  # Width after process

        # [top, bottom, left, right]: cut 20 px from top to get rid of the score table
        self.crop_dim = [20, self.state_size_h, 0, self.state_size_w]

        # Trust rate to our experiences
        self.gamma = GAMMA  # Discount coef for future predictions
        self.alpha = ALPHA  # Learning Rate

        # Epsilon starts at 1 (always explore) and is decayed towards
        # epsilon_minimum, so over time we explore less and exploit more
        self.epsilon = 1  # Probability of taking a random action
        self.epsilon_decay = EPSILON_DECAY  # Adaptive Epsilon Decay Rate
        self.epsilon_minimum = 0.05  # Decay stops once epsilon is at or below this

        # Deque holds replay mem.
        self.memory = deque(maxlen=MAX_MEMORY_LEN)

        # Create two models for the DDQN algorithm
        self.online_model = DuelCNN_Learner(h=self.target_h, w=self.target_w, output_size=self.action_size).to(DEVICE)
        self.target_model = DuelCNN_Learner(h=self.target_h, w=self.target_w, output_size=self.action_size).to(DEVICE)
        self.target_model.load_state_dict(self.online_model.state_dict())
        self.target_model.eval()

        # Adam used as optimizer
        self.optimizer = optim.Adam(self.online_model.parameters(), lr=self.alpha)

    def preProcess(self, image):
        """
        Grayscale, crop, resize and scale a raw frame.

        Args:
            image: Raw colour frame as returned by the environment.

        Returns:
            Array of shape (target_w, target_h) holding the pixel values
            divided by 255.
        """
        frame = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # To grayscale
        top, bottom, left, right = self.crop_dim
        frame = frame[top:bottom, left:right]  # Cut 20 px from top
        frame = cv2.resize(frame, (self.target_w, self.target_h))  # Resize
        # NOTE(review): cv2.resize returns an array of shape
        # (target_h, target_w), so this reshape re-interprets the row-major
        # buffer instead of keeping the image layout. It is kept as-is because
        # previously saved weights were trained on frames in this layout.
        frame = frame.reshape(self.target_w, self.target_h) / 255  # Normalize

        return frame

    def act(self, state):
        """
        Choose an action with an epsilon-greedy policy.

        With probability ``epsilon`` a random action is returned (explore);
        otherwise the action with the largest online-network Q-value is
        returned (exploit).

        Args:
            state: Stacked frames for a single state (no batch axis).

        Returns:
            Index of the selected action.
        """
        if random.uniform(0, 1) <= self.epsilon:
            return random.randrange(self.action_size)

        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.float, device=DEVICE).unsqueeze(0)
            q_values = self.online_model(state)  # (1, action_size)
            return torch.argmax(q_values).item()  # Index of the maximum over all elements

    def extract_data(self, agent):
        """
        Sample a minibatch from this agent's replay memory as tensors.

        No training happens here; pass the returned tensors to ``train``.

        Args:
            agent: Unused. Kept so existing callers, which pass the agent
                itself, keep working. The memory consulted is always
                ``self.memory``.

        Returns:
            A 9-tuple ``(loss, max_q, state_q_values, update, state,
            next_state, action, reward, done)``. ``loss`` and ``max_q`` are
            always 0. While the memory holds fewer than MIN_MEMORY_LEN
            transitions, ``update`` is False and every other entry is 0.
        """
        if len(self.memory) < MIN_MEMORY_LEN:
            return 0, 0, 0, False, 0, 0, 0, 0, 0

        # Draw the minibatch and regroup it field by field
        state, action, reward, next_state, done = zip(*random.sample(self.memory, BATCH_SIZE))

        # Every stored state carries a leading axis of length 1 (see
        # storeResults), so concatenating builds the batch axis
        state = np.concatenate(state)
        next_state = np.concatenate(next_state)

        # Convert them to tensors
        state = torch.tensor(state, dtype=torch.float, device=DEVICE)
        next_state = torch.tensor(next_state, dtype=torch.float, device=DEVICE)
        action = torch.tensor(action, dtype=torch.long, device=DEVICE)
        reward = torch.tensor(reward, dtype=torch.float, device=DEVICE)
        done = torch.tensor(done, dtype=torch.float, device=DEVICE)

        # Online Q-values of the sampled states, returned for callers that
        # want to inspect them before the update
        state_q_values_ = self.online_model(state)

        return 0, 0, state_q_values_, True, state, next_state, action, reward, done

    def train(self, agent, state, next_state, action, reward, done):
        """
        Run one Double-DQN update of the online network on a minibatch.

        Args:
            agent: Unused. Kept for backward compatibility with callers.
            state: Batch of states.
            next_state: Batch of successor states.
            action: Long tensor with the action taken in each state.
            reward: Float tensor with the reward of each transition.
            done: Float tensor, 1 where the transition ended the episode.

        Returns:
            Tuple ``(loss, max_q)``: the detached scalar loss tensor and the
            largest online Q-value predicted for ``state`` as a float.
        """
        # Make predictions
        state_q_values = self.online_model(state)

        # The bootstrap target is a constant w.r.t. the optimisation, so no
        # autograd graph is needed for it
        with torch.no_grad():
            next_states_q_values = self.online_model(next_state)
            next_states_target_q_values = self.target_model(next_state)
            # The online net picks the greedy next action, the target net
            # evaluates it (Double DQN)
            best_next_action = next_states_q_values.max(1)[1].unsqueeze(1)
            next_states_target_q_value = next_states_target_q_values.gather(1, best_next_action).squeeze(1)
            # Bellman target; (1 - done) removes the bootstrap term on terminal transitions
            expected_q_value = reward + self.gamma * next_states_target_q_value * (1 - done)

        # Find selected action's q_value
        selected_q_value = state_q_values.gather(1, action.unsqueeze(1)).squeeze(1)

        # Mean squared TD error
        loss = (selected_q_value - expected_q_value).pow(2).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Detach so callers that accumulate the loss do not keep graph nodes alive
        return loss.detach(), torch.max(state_q_values).item()

    def storeResults(self, state, action, reward, nextState, done):
        """
        Append one transition to the replay memory.

        Both states are stored with an extra leading axis of length 1.
        """
        self.memory.append([state[None, :], action, reward, nextState[None, :], done])

    def adaptiveEpsilon(self):
        """
        Multiply epsilon by the decay rate while it is above the minimum,
        so that later calls to ``act`` explore less.
        """
        if self.epsilon > self.epsilon_minimum:
            self.epsilon *= self.epsilon_decay

    def compute_Q(self, state):
        """Return the online network's Q-values for a batch of states."""
        state_q_values_ = self.online_model(state)
        return state_q_values_



# In [2]:
import os
import gym
import cv2
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import time
import json
import random
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from copy import deepcopy as c

from collections import deque
#from args import get_train_args

# Configuration for the attacker cell. NOTE: these module-level names are
# shared with the other cells; the assignment executed last is the one in effect.
ENVIRONMENT = "PongDeterministic-v4"  # Gym environment id

# Use the GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

SAVE_MODELS = False  # Save models to file so they can be tested later
MODEL_PATH = "./pong-cnn-"  # Path prefix used when saving or loading models
SAVE_MODEL_INTERVAL = 10  # Save models every X episodes
TRAIN_MODEL = True  # Train the model while playing (set to False when testing a model)

LOAD_MODEL_FROM_FILE = False  # Load model from file (was: True)
LOAD_FILE_EPISODE = 0  # Episode number of the checkpoint to load (was: 900)

BATCH_SIZE = 64  # Minibatch size sampled at random from the replay memory
MAX_EPISODE = 300  # Max number of episodes (was: 100000)
MAX_STEP = 100000  # Max number of steps in one episode

MAX_MEMORY_LEN = 64  # Max memory length (was: 20000)
MIN_MEMORY_LEN = 64  # Min memory length before training starts (was: 5000)


MAX_REPLAY_MEMORY_LEN = 25000  # Max length of the attacker's replay memory
MIN_REPLAY_MEMORY_LEN = 1000  # Min replay memory length before training starts (was: 64)

GAMMA = 0.97  # Discount rate
ALPHA = 0.00025  # Learning rate
EPSILON_DECAY = 0.99  # Factor epsilon is multiplied by on each adaptiveEpsilon() call

RENDER_GAME_WINDOW = False  # Open a new window to render the game (was: True)


class DuelCNN_Attacker(nn.Module):
    """
    Dueling CNN Q-network with a side-information input.
    https://arxiv.org/abs/1511.06581

    The conv trunk matches the learner's network. In each head the 128-unit
    hidden layer is reduced to 12 features, concatenated with the ``info``
    vector, mixed by one more 12-unit layer and projected to the head output.

    Args:
        info: Length of the side-information vector.
        h: Height of the input frames.
        w: Width of the input frames.
        output_size: Number of actions (width of the advantage head).
    """
    def __init__(self, info, h, w, output_size):
        super(DuelCNN_Attacker, self).__init__()
        # The input has 4 channels: one per stacked frame
        self.conv1 = nn.Conv2d(in_channels=4,  out_channels=32, kernel_size=8, stride=4)
        self.bn1 = nn.BatchNorm2d(32)
        convw, convh = self.conv2d_size_calc(w, h, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
        self.bn2 = nn.BatchNorm2d(64)
        convw, convh = self.conv2d_size_calc(convw, convh, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
        self.bn3 = nn.BatchNorm2d(64)
        convw, convh = self.conv2d_size_calc(convw, convh, kernel_size=3, stride=1)

        linear_input_size = convw * convh * 64  # Flattened size of the last conv layer's output

        # Advantage (action) head
        self.Alinear1 = nn.Linear(in_features=linear_input_size, out_features=128)
        self.Alrelu = nn.LeakyReLU()  # Activation after Alinear1
        # Info layers of the advantage head
        self.Ainfolayer1 = nn.Linear(in_features=128, out_features=12)
        self.Ainforelu1 = nn.LeakyReLU()

        self.Ainfolayer2 = nn.Linear(in_features=12+info, out_features=12)
        self.Ainforelu2 = nn.LeakyReLU()

        self.Alinear2 = nn.Linear(in_features=12, out_features=output_size)

        # State-value head
        self.Vlinear1 = nn.Linear(in_features=linear_input_size, out_features=128)
        self.Vlrelu = nn.LeakyReLU()  # Activation after Vlinear1
        # Info layers of the value head
        self.Vinfolayer1 = nn.Linear(in_features=128, out_features=12)
        self.Vinforelu1 = nn.LeakyReLU()

        self.Vinfolayer2 = nn.Linear(in_features=12+info, out_features=12)
        self.Vinforelu2 = nn.LeakyReLU()

        self.Vlinear2 = nn.Linear(in_features=12, out_features=1)  # Only 1 node

    def conv2d_size_calc(self, w, h, kernel_size=5, stride=2):
        """
        Return the (width, height) produced by an unpadded conv layer.

        Args:
            w: Input width.
            h: Input height.
            kernel_size: Square kernel size of the layer.
            stride: Stride of the layer.
        """
        next_w = (w - (kernel_size - 1) - 1) // stride + 1
        next_h = (h - (kernel_size - 1) - 1) // stride + 1
        return next_w, next_h

    def forward(self, x, info):
        """
        Compute Q-values from stacked frames and side information.

        Args:
            x: Float tensor of shape (batch, 4, H, W).
            info: Side information as a numpy array or tensor, either of
                shape (batch, info) or a single vector of shape (info,),
                which is treated as a batch of one.

        Returns:
            Tensor of shape (batch, output_size) with one Q-value per action.
        """
        # Put the side information on the same device as the frames. The
        # batch axis is added based on the rank of ``info`` rather than on a
        # hard-coded vector length, so any info size and batch size work.
        info_in = torch.as_tensor(info, dtype=torch.float, device=x.device)
        if info_in.dim() == 1:
            info_in = info_in.unsqueeze(0)

        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))

        x = x.view(x.size(0), -1)  # Flatten every sample in the batch

        Ax = self.Alrelu(self.Alinear1(x))  # (batch, 128)
        Ax = self.Ainforelu1(self.Ainfolayer1(Ax))  # (batch, 12)
        Ax = torch.cat((Ax, info_in), 1)  # (batch, 12 + info)
        Ax = self.Ainforelu2(self.Ainfolayer2(Ax))
        Ax = self.Alinear2(Ax)  # No activation on last layer

        Vx = self.Vlrelu(self.Vlinear1(x))  # (batch, 128)
        Vx = self.Vinforelu1(self.Vinfolayer1(Vx))  # (batch, 12)
        Vx = torch.cat((Vx, info_in), 1)  # (batch, 12 + info)
        Vx = self.Vinforelu2(self.Vinfolayer2(Vx))
        Vx = self.Vlinear2(Vx)  # No activation on last layer

        # Centre the advantages per sample (over the action axis). Averaging
        # over the whole tensor would make each sample's Q-values depend on
        # the other samples that happen to be in the same minibatch.
        q = Vx + (Ax - Ax.mean(dim=1, keepdim=True))

        return q


class Attacker:
    """
    Reward-delaying attacker that sits between the environment and a learner.

    Every true reward is written into a small buffer (the "disk"); on each
    step one buffered reward, possibly an older one, is published to the
    learner in its place. The class also owns a dueling Double-DQN
    (``DuelCNN_Attacker``) together with the memories used to train it.
    """
    def __init__(self, environment, disk_size, xi_size, zeta, delta):
        """
        Set up hyperparameters, memories, networks and the reward disk.

        Args:
            environment: Gym-style environment; only
                ``observation_space.shape`` is read.
            disk_size: Number of reward slots on the disk; also the number of
                outputs of the attacker network.
            xi_size: Length of the per-step vector ``xi`` built in
                ``step_one`` (action one-hot entries followed by the reward).
            zeta: Stored as ``self.zeta``; not used inside this class.
            delta: Stored as ``self.delta``; not used inside this class.
        """
        self.disk_size = disk_size
        self.info_dim = disk_size + xi_size  # Length of the info vector fed to the network

        # Raw frame size reported by the environment, e.g. (210, 160, 3)
        self.state_size_h = environment.observation_space.shape[0]
        self.state_size_w = environment.observation_space.shape[1]
        self.state_size_c = environment.observation_space.shape[2]

        # One network output per disk slot
        self.action_size = disk_size

        # Image pre process params
        self.target_h = 80  # Height after process
        self.target_w = 64  # Width after process

        # [top, bottom, left, right]: cut 20 px from top to get rid of the score table
        self.crop_dim = [20, self.state_size_h, 0, self.state_size_w]

        # Trust rate to our experiences
        self.gamma = GAMMA  # Discount coef for future predictions
        self.alpha = ALPHA  # Learning Rate

        # Epsilon starts at 1 and is decayed towards epsilon_minimum
        self.epsilon = 1  # Explore or Exploit
        self.epsilon_decay = EPSILON_DECAY  # Adaptive Epsilon Decay Rate
        self.epsilon_minimum = 0.05  # Decay stops once epsilon is at or below this

        # Short buffer of recent attacker transitions, plus the replay
        # memory that ``train`` samples from
        self.memory = deque(maxlen=64)
        self.replay_memory = deque(maxlen=MAX_REPLAY_MEMORY_LEN)

        # Create two models for the DDQN algorithm
        self.online_model = DuelCNN_Attacker(info = self.info_dim, h=self.target_h, w=self.target_w, output_size=self.action_size).to(DEVICE)
        self.target_model = DuelCNN_Attacker(info = self.info_dim, h=self.target_h, w=self.target_w, output_size=self.action_size).to(DEVICE)
        self.target_model.load_state_dict(self.online_model.state_dict())
        self.target_model.eval()

        # Adam used as optimizer
        self.optimizer = optim.Adam(self.online_model.parameters(), lr=self.alpha)

        # Reward disk: column 0 holds the stored reward, column 1 the number
        # of steps it has been held
        self.disk = np.zeros([disk_size, 2])
        self.zeta = zeta
        self.empty_positions = set(range(disk_size))  # Indices of free disk slots
        self.delta = delta
        self.save_list = []  # Partial attacker transition being assembled by step_one

    def preProcess(self, image):
        """
        Grayscale, crop, resize and scale a raw frame.

        Args:
            image: Raw colour frame as returned by the environment.

        Returns:
            Array of shape (target_w, target_h) holding the pixel values
            divided by 255.
        """
        frame = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # To grayscale
        top, bottom, left, right = self.crop_dim
        frame = frame[top:bottom, left:right]  # Cut 20 px from top
        frame = cv2.resize(frame, (self.target_w, self.target_h))  # Resize
        # NOTE(review): cv2.resize returns an array of shape
        # (target_h, target_w), so this reshape re-interprets the row-major
        # buffer instead of keeping the image layout. Kept unchanged so the
        # attacker sees frames in the same layout as the learner.
        frame = frame.reshape(self.target_w, self.target_h) / 255  # Normalize

        return frame

    def act(self, state, info):
        """
        Choose an attacker action for one state.

        Exploration is effectively disabled here: the explore branch is only
        taken when the random draw is exactly 0.0 (``self.epsilon`` is not
        consulted).

        Args:
            state: Stacked frames for a single state (no batch axis).
            info: Side-information vector for the same state.

        Returns:
            Tuple ``(action, q_values)``; ``q_values`` is the network output
            of shape (1, action_size), or 0 when a random action was drawn.
        """
        # The draw is kept so the global random stream advances as before
        if random.uniform(0, 1) <= 0.0:
            return random.randrange(self.action_size), 0

        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.float, device=DEVICE).unsqueeze(0)
            q_values = self.online_model(state, info)  # (1, action_size)
            action = torch.argmax(q_values).item()  # Index of the maximum over all elements

        return action, q_values

    def train(self, subset, done):
        """
        Run one Double-DQN update of the attacker's online network.

        A minibatch of BATCH_SIZE transitions is sampled from
        ``self.replay_memory``. Each transition is read as
        ``(state, action, reward, next_state, done)`` where ``state`` and
        ``next_state`` are ``(frames, info)`` pairs.

        Args:
            subset: Unused. Kept for backward compatibility with callers.
            done: Unused. Kept for backward compatibility with callers; the
                done flags come from the sampled transitions.

        Returns:
            ``(0, 0)`` while the replay memory holds fewer than BATCH_SIZE
            transitions, otherwise ``(loss, max_q)``: the detached scalar
            loss tensor and the largest online Q-value of the batch.
        """
        if len(self.replay_memory) < BATCH_SIZE:
            return 0, 0

        batch = random.sample(self.replay_memory, BATCH_SIZE)
        states, actions, rewards, next_states, dones = zip(*batch)

        def to_float(value):
            # Accepts numbers, numpy arrays and tensors alike
            return torch.as_tensor(value, dtype=torch.float, device=DEVICE)

        state = torch.stack([to_float(s[0]) for s in states])
        next_state = torch.stack([to_float(s[0]) for s in next_states])
        info_attacker = np.stack([s[1] for s in states], axis=0)
        next_info_attacker = np.stack([s[1] for s in next_states], axis=0)

        # Rewards may arrive as 1-element tensors (see __dynamic_reward__).
        # Flatten every per-transition scalar to shape (batch,), otherwise a
        # (batch, 1) reward broadcasts the TD target to (batch, batch).
        action = torch.stack(
            [torch.as_tensor(a, dtype=torch.long, device=DEVICE) for a in actions]
        ).reshape(-1)
        reward = torch.stack([to_float(r) for r in rewards]).reshape(-1)
        done_mask = torch.stack([to_float(d) for d in dones]).reshape(-1)

        # Make predictions
        state_q_values = self.online_model(state, info_attacker)

        # The bootstrap target is a constant w.r.t. the optimisation, so no
        # autograd graph is needed for it
        with torch.no_grad():
            next_states_q_values = self.online_model(next_state, next_info_attacker)
            next_states_target_q_values = self.target_model(next_state, next_info_attacker)
            # The online net picks the greedy next action, the target net
            # evaluates it (Double DQN)
            best_next_action = next_states_q_values.max(1)[1].unsqueeze(1)
            next_states_target_q_value = next_states_target_q_values.gather(1, best_next_action).squeeze(1)
            # Bellman target; (1 - done) removes the bootstrap term on terminal transitions
            expected_q_value = reward + self.gamma * next_states_target_q_value * (1 - done_mask)

        # Find selected action's q_value
        selected_q_value = state_q_values.gather(1, action.unsqueeze(1)).squeeze(1)

        # Mean squared TD error
        loss = (selected_q_value - expected_q_value).pow(2).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Detach so callers that accumulate the loss do not keep graph nodes alive
        return loss.detach(), torch.max(state_q_values).item()

    def storeResults(self, state, action, reward, nextState, done):
        """
        Append one transition to ``self.memory``.

        Both states are stored with an extra leading axis of length 1.
        """
        self.memory.append([state[None, :], action, reward, nextState[None, :], done])

    def adaptiveEpsilon(self):
        """
        Multiply epsilon by the decay rate while it is above the minimum.
        """
        if self.epsilon > self.epsilon_minimum:
            self.epsilon *= self.epsilon_decay

    ### Attacker MDP related functions
    def return_memory(self):
        """Return ``self.memory`` (not a copy), or None when it is empty."""
        if len(self.memory) > 0:
            return self.memory
        return None

    def __retrieve_infor__(self, xi):
        """
        Store the newest reward (last entry of ``xi``) in a free disk slot.

        Returns:
            Tuple ``(index, reward)`` of the slot used and the reward stored.

        Raises:
            KeyError: If no disk slot is free.
        """
        index = self.empty_positions.pop()
        retrieve_reward = xi[-1]
        self.disk[index, 0] = retrieve_reward
        self.disk[index, 1] = 0  # Freshly stored: held for 0 steps
        return index, retrieve_reward

    def __delay_attack_strategy__(self, obs_attacker, action):
        """
        Rule-based choice of the disk slot whose reward is published.

        For learner action 0 the slot holding the largest reward is chosen,
        for any other action the slot holding the smallest one.

        Args:
            obs_attacker: Unused by this rule.
            action: Action taken by the learner.

        Returns:
            1-based index of the chosen disk slot.
        """
        stored_rewards = self.disk[:, -2]  # Column 0 of the two-column disk
        if action == 0:
            return np.argmax(stored_rewards) + 1
        return np.argmin(stored_rewards) + 1

    def step_one(self, args, state, next_state, action, reward, done):
        """
        Process one learner step: buffer the true reward, publish another.

        Args:
            args: Unused.
            state: Learner state for this step.
            next_state: Unused.
            action: Action taken by the learner; selects the one-hot entry of
                ``xi`` and drives the publishing rule.
            reward: True environment reward of this step.
            done: Unused.

        Returns:
            The reward published to the learner, or None when the chosen disk
            slot was empty.
        """
        # xi = [one-hot entries for the action ..., reward]; its length is the
        # part of the info vector that is not the disk
        xi = np.zeros(self.info_dim - self.disk_size)
        xi[-1] = reward
        xi[action] = 1

        # The previous step left its ``change`` flag at the end of save_list;
        # replace it with a temporary reward of 0
        if self.save_list:
            self.save_list[-1] = 0

        index, retrieve_reward = self.__retrieve_infor__(xi)
        info_attacker = c(np.concatenate([xi, self.disk[:, 0]]))
        obs_attacker = [state, info_attacker]
        self.save_list.append(obs_attacker)

        # Save needed information for training the attacker agent:
        # (observation, chosen slot, temporary reward, next observation)
        if len(self.save_list) == 4:
            self.memory.append(tuple(self.save_list))
            self.save_list = [c(obs_attacker)]

        send_back_index = self.__delay_attack_strategy__(obs_attacker, action)

        # If the chosen disk position holds no information, publish None
        if send_back_index - 1 in self.empty_positions:
            send_back_reward = None
        else:
            send_back_reward = self.disk[send_back_index - 1, -2]
            self.empty_positions.add(send_back_index - 1)

        # Age every reward that is still held on the disk
        for j in range(self.disk_size):
            if j not in self.empty_positions:
                self.disk[j, -1] = self.disk[j, -1] + 1

        self.save_list.append(send_back_index - 1)
        change = send_back_reward != retrieve_reward
        self.save_list.append(change)

        return send_back_reward

    def __dynamic_reward__(self, state_q_values_prev, state_q_values_now, act_teacher):
        """
        Attacker reward for a targeted attack towards action index 0.

        The cross-entropy between each Q-value vector and a one-hot target on
        index 0 is computed before and after the learner's update. In the
        original comment the actions are listed as: No Operation, Fire,
        Right, Left, Right Fire, Left Fire.

        For an untargeted attack the original author suggested using instead
            updated_policy = torch.softmax(state_q_values_now, dim=-1)
            reward = -torch.sum(torch.mul(updated_policy, state_q_values_prev))
        or ``-self.pearsonr(state_q_values_now, state_q_values_prev)``.

        Args:
            state_q_values_prev: 1-D tensor of Q-values before the update.
            state_q_values_now: 1-D tensor of Q-values after the update.
            act_teacher: Unused.

        Returns:
            Tensor ``[1]`` when the cross-entropy decreased, ``[-1]``
            otherwise, on the device of ``state_q_values_prev``.
        """
        # Follow the inputs' device instead of forcing CUDA, so this also
        # runs when DEVICE falls back to the CPU
        device = state_q_values_prev.device
        loss_CE = torch.nn.CrossEntropyLoss()

        # One-hot class-probability target on action index 0, shape (1, n_actions)
        target_q_value_dist = torch.zeros(int(state_q_values_prev.shape[0]), device=device)
        target_q_value_dist[0] = 1
        target_q_value_dist = target_q_value_dist.unsqueeze(0)

        prev_CE_loss = loss_CE(state_q_values_prev.unsqueeze(0), target_q_value_dist)
        now_CE_loss = loss_CE(state_q_values_now.unsqueeze(0), target_q_value_dist)

        moved_towards_target = (prev_CE_loss - now_CE_loss) > 0
        return torch.tensor([1 if moved_towards_target else -1], device=device)

    def pearsonr(self, x, y):
        """Return the Pearson correlation coefficient of two 1-D tensors."""
        mean_x = torch.mean(x)
        mean_y = torch.mean(y)
        xm = x.sub(mean_x)
        ym = y.sub(mean_y)
        r_num = xm.dot(ym)
        r_den = torch.norm(xm, 2) * torch.norm(ym, 2)
        r_val = r_num / r_den
        return r_val

    

# In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import gym
import cv2
import time
import json
import random
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from copy import deepcopy as c
import collections
import itertools
from collections import deque

#from model_pong import Agent


# Configuration for the training run below. These assignments replace the
# values of the same names set in the earlier cells.
ENVIRONMENT = "PongDeterministic-v4"  # Gym environment id

# Use the GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

SAVE_MODELS = False  # Save models to file so they can be tested later
MODEL_PATH = "./pong-cnn_D_RULE"  # Path prefix for saving or loading learner/attacker models
Teacher_path = "pong-cnn-"  # Path prefix of the pretrained teacher checkpoint
SAVE_MODEL_INTERVAL = 50  # Save models every X episodes
TRAIN_MODEL = True  # Train the model while playing (set to False when testing a model)

LOAD_MODEL_FROM_FILE = False  # Load learner and attacker models from file
LOAD_FILE_EPISODE_Teacher = 800  # Episode number of the teacher checkpoint to load (was: 900)
LOAD_FILE_EPISODE = 0  # Episode number of the learner/attacker checkpoint to load (was: 300)

BATCH_SIZE = 64  # Minibatch size sampled at random from the replay memory
MAX_EPISODE = 1201  # Exclusive upper bound of the episode counter (was: 100000)
MAX_STEP = 100000  # Max number of steps in one episode

MAX_MEMORY_LEN = 25000  # Max replay memory length
MIN_MEMORY_LEN = 10000  # Min replay memory length before training starts (was: 40000)

GAMMA = 0.97  # Discount rate
ALPHA = 0.00025  # Learning rate
EPSILON_DECAY = 0.99  # Factor epsilon is multiplied by on each adaptiveEpsilon() call

RENDER_GAME_WINDOW = False  # Open a new window to render the game (won't work on Colab by default)


def prep_batch_data(state, action, reward, next_state, done):
    """
    Turn the five fields of a transition batch into tensors on DEVICE.

    ``action`` becomes a long tensor; every other field becomes a float
    tensor. The inputs are used as given (they are not concatenated first).

    Returns:
        Tuple ``(state, action, reward, next_state, done)`` of tensors.
    """
    def as_float(values):
        return torch.tensor(values, dtype=torch.float, device=DEVICE)

    state_t = as_float(state)
    next_state_t = as_float(next_state)
    action_t = torch.tensor(action, dtype=torch.long, device=DEVICE)
    reward_t = as_float(reward)
    done_t = as_float(done)
    return state_t, action_t, reward_t, next_state_t, done_t


# Training script: a learner (Agent) plays Pong while an Attacker replaces
# each true reward with one taken from its disk. A pretrained teacher network
# is queried on every step only to measure how often the learner picks
# action 0 in states where the teacher picks something else.
if __name__ == "__main__":
    environment = gym.make(ENVIRONMENT)  # Get env
    agent = Agent(environment)  # Create Agent
    attacker = Attacker(environment, 11, 7, 15, 1)  # disk_size, xi_size, zeta, delta

    ## Initialize and load weights for the pretrained teacher network
    teacher = c(agent)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine
    teacher.online_model.load_state_dict(
        torch.load(Teacher_path + str(LOAD_FILE_EPISODE_Teacher) + ".pkl", map_location=DEVICE)
    )
    with open(Teacher_path + str(LOAD_FILE_EPISODE_Teacher) + '.json') as outfile:
        param = json.load(outfile)
        teacher.epsilon = param.get('epsilon')

    if LOAD_MODEL_FROM_FILE:
        agent.online_model.load_state_dict(
            torch.load(MODEL_PATH + str(LOAD_FILE_EPISODE) + ".pkl", map_location=DEVICE)
        )
        attacker.online_model.load_state_dict(
            torch.load(MODEL_PATH + str(LOAD_FILE_EPISODE) + "_atk_" + ".pkl", map_location=DEVICE)
        )

        with open(MODEL_PATH + str(LOAD_FILE_EPISODE) + '.json') as outfile:
            param = json.load(outfile)
            agent.epsilon = param.get('epsilon')
            attacker.epsilon = param.get('epsilon')

        startEpisode = LOAD_FILE_EPISODE + 1

    else:
        startEpisode = 1

    last_100_ep_reward = deque(maxlen=100)  # Last 100 episode rewards
    total_step = 1  # Cumulative sum of all steps in episodes
    ATT_FLAG = False
    # Latest transitions and the teacher's actions for them; filled here but
    # not read anywhere in this script
    proxy_memory = deque(maxlen=BATCH_SIZE+1)
    teacher_act_list = list()
    for episode in range(startEpisode, MAX_EPISODE):

        startTime = time.time()  # Keep time
        state = environment.reset()  # Reset env

        state = agent.preProcess(state)  # Process image

        # Stack state. Every state contains 4 consecutive frames,
        # stacked like a 4-channel image
        state = np.stack((state, state, state, state))

        total_max_q_val = 0  # Total max q vals
        total_reward = 0  # Total (true) reward for each episode
        total_loss = 0  # Total loss for each episode
        num_action = 0  # Steps where the learner chose action 0 although the teacher did not
        count_target_state = 0  # Steps where the teacher chose a non-zero action
        reward_count = 0
        reward_state_count = 0.001
        for step in range(MAX_STEP):

            # Select and perform an action
            action = agent.act(state)  # Act

            # Check the action of the teacher network at the current state
            # and compare it with the learner's action
            teacher_action = teacher.act(state)
            if teacher_action != 0:
                count_target_state += 1
                if action == 0:
                    num_action += 1

            next_state, reward, done, info = environment.step(action)  # Observe

            next_state = agent.preProcess(next_state)  # Process image

            # Newest frame first, followed by the three most recent old frames
            next_state = np.stack((next_state, state[0], state[1], state[2]))

            # Attacker performs its action by publishing a reward stored in its disk
            return_reward = attacker.step_one(0, state, next_state, action, reward, done)

            # The learner only ever sees the published reward. When the
            # attacker publishes nothing (None), a reward of 0 is stored.
            published_reward = return_reward if return_reward is not None else 0
            agent.storeResults(state, action, published_reward, next_state, done)  # Store to mem

            # Keep the latest transition and the teacher's action alongside it
            proxy_memory.append((state, action, published_reward, next_state, done))
            teacher_act_list.append(teacher_action)

            # Move to the next state
            state = next_state  # Update state

            # Main Training Loop
            if TRAIN_MODEL:
                # Agent extracts batch data from its replay buffer
                loss, max_q_val, state_Q_val, update_agent, state_inf, next_state_inf, action_inf, reward_inf, done_inf = agent.extract_data(agent)

                # update_agent is False until the buffer holds MIN_MEMORY_LEN
                # transitions; loss and max_q_val stay 0 in that case
                if update_agent:
                    # Agent model is trained and updated with the extracted data
                    loss, max_q_val = agent.train(agent, state_inf, next_state_inf, action_inf, reward_inf, done_inf)

            else:
                loss, max_q_val = [0, 0]

            # Accumulate values for logging. float() keeps the running sum a
            # plain number instead of a growing chain of autograd nodes.
            total_loss += float(loss)
            total_max_q_val += max_q_val
            total_reward += reward
            total_step += 1

            # Decay epsilon once every 1000 total steps
            if total_step % 1000 == 0:
                agent.adaptiveEpsilon()  # Decrease epsilon
                attacker.adaptiveEpsilon()

            if done:  # Episode completed
                currentTime = time.time()  # Keep current time
                time_passed = currentTime - startTime  # Find episode duration
                current_time_format = time.strftime("%H:%M:%S", time.gmtime())  # Get current dateTime as HH:MM:SS
                epsilonDict = {'epsilon': agent.epsilon}  # Create epsilon dict to save model as file

                # Save the attacker and learner models every SAVE_MODEL_INTERVAL episodes
                if SAVE_MODELS and episode % SAVE_MODEL_INTERVAL == 0:  # Save model as file
                    weightsPath = MODEL_PATH + str(episode) + '.pkl'
                    weightsPath_atk = MODEL_PATH + str(episode) + "_atk_" + '.pkl'
                    epsilonPath = MODEL_PATH + str(episode) + '.json'

                    torch.save(agent.online_model.state_dict(), weightsPath)
                    torch.save(attacker.online_model.state_dict(), weightsPath_atk)
                    with open(epsilonPath, 'w') as outfile:
                        json.dump(epsilonDict, outfile)

                # After every episode update the target network of learner and attacker
                if TRAIN_MODEL:
                    agent.target_model.load_state_dict(agent.online_model.state_dict())  # Update target model
                    attacker.target_model.load_state_dict(attacker.online_model.state_dict())

                # Logging
                last_100_ep_reward.append(total_reward)
                # step is the 0-based index of the last step; guard the
                # division for an episode that ends on its very first step
                avg_max_q_val = total_max_q_val / max(step, 1)
                # Fraction of teacher-non-zero states where the learner chose
                # action 0; reported as 0.0 when there were no such states
                frac = num_action / count_target_state if count_target_state else 0.0

                outStr = "Episode:{} Time:{} Reward:{:.2f} Loss:{:.2f} Last_100_Avg_Rew:{:.3f} Avg_Max_Q:{:.3f} Epsilon:{:.2f} Duration:{:.2f} Step:{} CStep:{} action_num:{} frac:{} ".format(
                    episode, current_time_format, total_reward, total_loss, np.mean(last_100_ep_reward), avg_max_q_val, agent.epsilon, time_passed, step, total_step, num_action, frac,
                )

                print(outStr)

                # The log line is always appended to file, regardless of SAVE_MODELS
                outputPath = MODEL_PATH + "out" + '.txt'  # Save outStr to file
                with open(outputPath, 'a') as outfile:
                    outfile.write(outStr+"\n")

                break

