# Invader Defender 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
from scipy.optimize import linprog

In [2]:
# Unit moves on the grid, listed clockwise starting from "up".
actions = [
    [-1, 0],  # up
    [0, 1],   # right
    [1, 0],   # down
    [0, -1],  # left
]
action_count = len(actions)

# Side length of the square grid, and the number of cells on it.
gridSize = 6
state_count = gridSize ** 2

In [3]:
class Invader_Defender():
    """Two-player invader/defender game on a square grid.

    A joint state is a flat list ``[d0, d1, i0, i1]``: the defender's two grid
    coordinates followed by the invader's. The defender wins when the invader
    is within capture range (any of the 8 neighbouring cells, or the same
    cell) while not standing on the territory cell. The invader wins when it
    stands on the territory cell and the defender is out of capture range.

    Attributes:
        size: side length of the grid.
        states: every single-agent position ``[i, j]`` on the grid.
        territory_state: the cell the invader is trying to reach.
        game_state_list: every joint state (defender position + invader position).
        defender_won / invader_won: joint states that are terminal for each side.
            These lists are built once in ``__init__``; ``terminal_check`` uses
            set copies taken at construction time.
        transition_prob: transition probability (moves are deterministic, so 1).
        new_defender_state / new_invader_state / new_state / reward:
            results of the most recent ``next_state`` call.
    """

    def __init__(self, gridSize):
        self.valueMap = np.zeros((gridSize, gridSize))
        self.states = [[i, j] for i in range(gridSize) for j in range(gridSize)]
        self.size = gridSize

        # deterministic transition
        self.transition_prob = 1

        # initialize defender and invader states
        self.new_state = [0, 0, 0, 0]
        self.new_defender_state = [0, 0]
        self.new_invader_state = [0, 0]

        # set territory state
        self.territory_state = [4, 4]

        # every joint state: defender position followed by invader position
        self.game_state_list = [
            defender_state + invader_state
            for defender_state in self.states
            for invader_state in self.states
        ]

        # defender victory: invader is off the territory and within capture range
        self.defender_won = [
            defender_state + invader_state
            for defender_state in self.states
            for invader_state in self.states
            if invader_state != self.territory_state
            and self._within_capture_range(defender_state, invader_state)
        ]

        # invader victory: invader is on the territory and the defender is out of range
        self.invader_won = [
            defender_state + self.territory_state
            for defender_state in self.states
            if not self._within_capture_range(defender_state, self.territory_state)
        ]

        # hashed copies so terminal_check is a constant-time lookup instead of a list scan
        self._defender_won_set = {tuple(state) for state in self.defender_won}
        self._invader_won_set = {tuple(state) for state in self.invader_won}

    @staticmethod
    def _within_capture_range(defender_state, invader_state):
        """Return True when the two positions are at Euclidean distance <= sqrt(2)."""
        d0 = defender_state[0] - invader_state[0]
        d1 = defender_state[1] - invader_state[1]
        # compare squared integer distances to avoid a floating-point comparison
        return d0 * d0 + d1 * d1 <= 2

    def _move(self, position, action):
        """Return ``position + action``, or a copy of ``position`` if that leaves the grid."""
        target = [coord + step for coord, step in zip(position, action)]
        if all(0 <= coord < self.size for coord in target):
            return target
        return list(position)

    def initial_states(self):
        "returns initial states of invader and defender"
        defender_state = [0, 0]
        # use this instance's grid size rather than the module-level gridSize
        invader_state = [self.size - 1, 0]
        return defender_state + invader_state

    def possible_states(self):
        """
        A function that returns a list of all possible states in the game
        """
        return self.game_state_list

    def terminal_check(self, state):
        """
        A function that checks whether the game is at a terminal state.
        Terminal state happens when either the invader or defender has won.
        Input: state [x1, y1, x2, y2]
        Output: (terminal, status) where terminal is a bool and status is one of
            "Defender Won", "Invader Won", "Game in Progress"
        """
        key = tuple(state)
        if key in self._defender_won_set:
            return True, "Defender Won"
        if key in self._invader_won_set:
            return True, "Invader Won"
        return False, "Game in Progress"

#     def transition_probability(self, transition):
#         """
#         A function that returns the transition probability...?
#         """
#         return self.transition_prob, reward

    def next_state(self, current_state, defender_action, invader_action):
        """
        A function that returns the next state
        Input: current state [x1,y1,x2,y2], defender_action [0, 1], invader_action [0,-1]
        Output: next state [x1,y1,x2,y2] and the defender's reward
            - If the action takes the agent off grid, the agent remains in original state
            - If the defender won, the reward is the manhattan distance between the
            invader's captured state and the territory
            - If the invader won, the reward is -100
            - Otherwise the reward is 0
        The results are also stored on self.new_defender_state,
        self.new_invader_state, self.new_state and self.reward.
        """
        # deconstruct current state [0,0,1,1] into defender [0,0] and invader [1,1] state
        defender_state = list(current_state[:2])
        invader_state = list(current_state[2:4])

        # apply the moves; an agent that would leave the grid stays where it is
        self.new_defender_state = self._move(defender_state, defender_action)
        self.new_invader_state = self._move(invader_state, invader_action)

        # build the joint state as a NEW list so the per-agent attributes are not mutated
        self.new_state = self.new_defender_state + self.new_invader_state

        # calculate rewards
        terminal, status = self.terminal_check(self.new_state)
        if not terminal:
            self.reward = 0
        elif status == "Defender Won":
            # manhattan distance between the invader's captured state and the territory
            self.reward = sum(
                abs(coord - target)
                for coord, target in zip(self.new_invader_state, self.territory_state)
            )
        else:
            # defender reward if invader won
            self.reward = -100

        return self.new_state, self.reward

## Testing 

In [4]:
# Build a game instance on a 6x6 grid for the quick checks below.
invader_defender = Invader_Defender(6)

In [5]:
# Smoke test: the defender at [2, 1] moves up; the invader at [0, 0] also tries to
# move up, which would leave the grid, so it stays where it is.
next_state, reward = invader_defender.next_state([2,1,0,0], [-1, 0], [-1, 0])

In [6]:
# Display the joint state returned by the call above.
next_state

[1, 1, 0, 0]

In [7]:
# Display the defender's reward returned by the call above.
reward

8

In [8]:
# In [1, 1, 0, 0] the invader is diagonally adjacent to the defender and is not on the
# territory cell, so this should report a defender win.
invader_defender.terminal_check([1, 1, 0, 0])

(True, 'Defender Won')

## Initialization and Function Definitions

In [9]:
# Fresh 6x6 game instance used by the solver cells that follow.
invader_defender = Invader_Defender(6)

In [10]:
# iteration counter, discount factor and per-iteration delta history
k = 0
gamma = 0.9
delta_list = []

# joint states as tuples so they can be used as dictionary keys
state_list = list(map(tuple, invader_defender.game_state_list))
listofzeros = [0.0] * len(invader_defender.game_state_list)

# G: per-state table, U: per-iteration value tables (U[k][state]); both start at 0.0
G = dict.fromkeys(state_list, 0.0)
U = {k: dict.fromkeys(state_list, 0.0)}

In [11]:
def calculate_payoff(state):
    """
    Build the stage-game payoff matrix of one joint state.
    Every (defender action, invader action) pair is simulated with
    invader_defender.next_state and scored as the immediate reward plus the
    discounted value U[k] of the resulting state.
    Input: state (ie. [0,0,1,1])
    Output: payoff = 4x4 matrix; payoff[i, j] is the defender's payoff when the
    defender takes action i and the invader takes action j
    """
    payoff = np.zeros((4, 4))
    discount = gamma * invader_defender.transition_prob
    # np.ndindex walks (row, col) pairs in row-major order
    for row, col in np.ndindex(action_count, action_count):
        successor, reward = invader_defender.next_state(state, actions[row], actions[col])
        payoff[row, col] = reward + discount * U[k][tuple(successor)]
    return payoff

In [12]:
def calculate_value(G_state):
    """
    A function that calculates the value of a zero-sum matrix game by linear programming.
    The row player (defender) maximises and the column player (invader) minimises;
    the value is computed from both perspectives, which are equal in magnitude
    and opposite in sign.
    Input: payoff matrix of a particular state (4x4 in this notebook; any
        2-D shape is accepted), entry [i, j] = defender payoff for defender
        action i and invader action j
    Output: (defender_value, invader_value) = scalar values of the game
    """
    payoff = np.asarray(G_state, dtype=float)
    row_count, col_count = payoff.shape

    # defender lin prog: variables (p_1..p_m, v); maximise v (i.e. minimise -v)
    # subject to v <= sum_i p_i * payoff[i, j] for every invader action j
    c = [0] * row_count + [-1]
    Aub = np.concatenate((-1 * np.transpose(payoff), np.ones((col_count, 1))), 1)
    defender_solution = linprog(
        c,
        A_ub=Aub,
        b_ub=[0] * col_count,
        A_eq=[[1] * row_count + [0]],       # probabilities sum to one
        b_eq=[1.0],
        bounds=[(0, 1)] * row_count + [(None, None)],
        # 'simplex' was removed from SciPy; HiGHS is the supported solver
        method='highs',
    )

    # invader lin prog: variables (q_1..q_n, w); minimise w
    # subject to w >= sum_j payoff[i, j] * q_j for every defender action i
    c = [0] * col_count + [1]
    Aub = np.concatenate((payoff, -1 * np.ones((row_count, 1))), 1)
    invader_solution = linprog(
        c,
        A_ub=Aub,
        b_ub=[0] * row_count,
        A_eq=[[1] * col_count + [0]],
        b_eq=[1.0],
        bounds=[(0, 1)] * col_count + [(None, None)],
        method='highs',
    )

    # the defender LP minimised -v; the invader's value is the negative of w
    defender_value = defender_solution['fun']*-1
    invader_value = invader_solution['fun']*-1

    return defender_value, invader_value

In [13]:
def equilibrium(G_state):
    """
    A function that obtains the equilibrium (mixed) policies of a zero-sum matrix
    game by linear programming. The row player (defender) maximises and the
    column player (invader) minimises.
    Input: payoff matrix of a particular state (4x4 in this notebook; any
        2-D shape is accepted), entry [i, j] = defender payoff for defender
        action i and invader action j
    Output: (defender_policy, invader_policy) = probability vectors over the
        row actions and the column actions respectively
    """
    payoff = np.asarray(G_state, dtype=float)
    row_count, col_count = payoff.shape

    # defender lin prog: variables (p_1..p_m, v); maximise v (i.e. minimise -v)
    # subject to v <= sum_i p_i * payoff[i, j] for every invader action j
    c = [0] * row_count + [-1]
    Aub = np.concatenate((-1 * np.transpose(payoff), np.ones((col_count, 1))), 1)
    defender_solution = linprog(
        c,
        A_ub=Aub,
        b_ub=[0] * col_count,
        A_eq=[[1] * row_count + [0]],       # probabilities sum to one
        b_eq=[1.0],
        bounds=[(0, 1)] * row_count + [(None, None)],
        # 'simplex' was removed from SciPy; HiGHS is the supported solver
        method='highs',
    )

    # invader lin prog: variables (q_1..q_n, w); minimise w
    # subject to w >= sum_j payoff[i, j] * q_j for every defender action i
    c = [0] * col_count + [1]
    Aub = np.concatenate((payoff, -1 * np.ones((row_count, 1))), 1)
    invader_solution = linprog(
        c,
        A_ub=Aub,
        b_ub=[0] * row_count,
        A_eq=[[1] * col_count + [0]],
        b_eq=[1.0],
        bounds=[(0, 1)] * col_count + [(None, None)],
        method='highs',
    )

    # drop the trailing game-value variable, keep only the action probabilities
    defender_policy = defender_solution['x'][:row_count]
    invader_policy = invader_solution['x'][:col_count]

    return defender_policy, invader_policy

# Minimax Q-learning

In [14]:
import warnings
import numpy as np
warnings.filterwarnings('ignore')

# Minimax Q-learning: after every training episode a greedy test episode is played
# from the initial state and its discounted rewards are accumulated.
tolerance = 1e-6
delta = 1
k = 0
runs = 1
r = 0
episodes = 300
alpha = 0.1      # learning rate
epsilon = 0.5    # probability of taking exploratory actions in a training step
delta_list = []
agentNum = 2

while r < runs:
    # Initialize Q: one action_count x action_count matrix per joint state
    Qj = np.zeros((invader_defender.size,) * 4 + (action_count, action_count))
    delta = 0
    accumulatedRewards = [0.0] * agentNum  # for collecting rewards

    for ep in range(episodes):
        jointState = invader_defender.initial_states()
        terminal = False
        steps = 0
        while terminal == False and steps < 200:

            #1 get actions
            state_index = tuple(jointState)
            A = Qj[state_index]
            p, q = equilibrium(A)    # initial policy
            # NOTE(review): equilibrium() returns (row-player policy, column-player
            # policy). The row policy drives the invader's action here, while the
            # update in step 4 indexes Qj as [defender action][invader action].
            # Confirm which axis belongs to which player.
            actioni = np.argmax(p)   # choose actions
            actiond = np.argmax(q)
            # random action for exploration: both agents switch to a different action
            if np.random.binomial(1, epsilon) == 1:
                actiondex = np.random.choice(action_count)
                while actiondex == actiond:
                    actiondex = np.random.choice(action_count)
                actiond = actiondex
                actioniex = np.random.choice(action_count)
                while actioniex == actioni:
                    actioniex = np.random.choice(action_count)
                actioni = actioniex

            #2 find next state and reward
            new_jointState, reward = invader_defender.next_state(jointState, actions[actiond], actions[actioni])

            #3 Calculate the value of the game at the next state
            A = Qj[tuple(new_jointState)]
            p2, q2 = equilibrium(A)

            # Value of the game to update Qj: v = pT*A*q
            ai = np.transpose(p2)
            dotai = np.dot(ai, A)
            new_qj = np.dot(dotai, q2)

            #4 Update Qj
            # Index with the integer action indices. Indexing with the action
            # vectors themselves (lists) is NumPy advanced indexing, which returns
            # a copy, so the assignment never reached Qj.
            q_index = state_index + (actiond, actioni)
            qj = Qj[q_index]  # backup old qj
            Qj[q_index] = (1-alpha)*qj + (alpha)*(-1*reward + gamma*new_qj)

            #old_delta = delta
            #delta = max(old_delta, abs(q1-new_q1))
            #delta_list.append(delta)

            #5 Update new jointState
            jointState = new_jointState
            steps += 1  # was "=+ 1", which reset steps to 1 and disabled the step cap

            #6 check terminal condition
            terminal, status = invader_defender.terminal_check(jointState)

            if steps < 200:
                print(status)
            else:
                print("max steps exceed")

        # Reset for testing
        terminal = False
        jointState = invader_defender.initial_states()
        # count for discounted rewards
        count = 0

        # Run testing: greedy play with the current Q table
        while terminal == False and count < 200:

            #1: get actions
            A = Qj[tuple(jointState)]
            p, q = equilibrium(A)
            actioni = np.argmax(p)
            actiond = np.argmax(q)

            #2: Find next state and reward
            new_jointState, reward = invader_defender.next_state(jointState, actions[actiond], actions[actioni])

            #3: Collect reward (agent 0 receives reward, every other agent its negative)
            for a in range(agentNum):
                if a == 0:
                    accumulatedRewards[a] += (gamma**count)*reward
                else:
                    accumulatedRewards[a] += (gamma**count)*-1*reward

            #4: Update state
            jointState = new_jointState

            #5: check terminal condition
            terminal, status = invader_defender.terminal_check(jointState)
            count += 1
            if count < 200:
                print(status)
            else:
                print("max steps exceed")

    r += 1
print("rewards 1", accumulatedRewards[0])
print("rewards 2", accumulatedRewards[1])

Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Defender Won
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
Game in Progress
G

KeyboardInterrupt: 

In [None]:
# policy extraction
# policy extraction: solve the stage game of every joint state and keep both policies
defender_policy, invader_policy = {}, {}

for state in invader_defender.game_state_list:
    key = tuple(state)
    payoff_matrix = calculate_payoff(state)
    G[key] = payoff_matrix
    defender_policy[key], invader_policy[key] = equilibrium(payoff_matrix)

### Save Results to Pickle 

In [None]:
import pickle

# The original author notes that convergence to within tolerance takes
# ~ 50 minutes (132 iterations), so the learned U and G are written to disk
# and can be reloaded quickly during development.

with open('U.pickle', 'wb') as handle:
    handle.write(pickle.dumps(U))

with open('G.pickle', 'wb') as handle:
    handle.write(pickle.dumps(G))

In [None]:
# # uncomment to load U and G

# with open ('U.pickle', 'rb') as handle:
#     test_U = pickle.load(handle)
    
# with open ('G.pickle', 'rb') as handle:
#     test_G = pickle.load(handle)

# # converged k. This gives the last update to U dict
# k = 132

## Plot Delta 

In [None]:
# Plot the recorded per-iteration delta values.
plt.plot(delta_list)
plt.title('Iteration vs Delta')
plt.xlabel('Iteration')
# Ten evenly spaced ticks up to k. Skipped when k is 0, because the tick step
# k/10 would then be zero and np.arange cannot build the range.
if k > 0:
    plt.xticks(np.arange(0, k, k/10))
plt.ylabel('Delta')

## Defender and Invader Policy 

In [None]:
# Display the defender's policy for every joint state (keys are state tuples).
defender_policy

In [None]:
# Display the invader's policy for every joint state (keys are state tuples).
invader_policy