# Invader Defender 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import pickle
from scipy.optimize import linprog
import pandas as pd
import time
import random

# to remove warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
actions = [[-1, 0], [0, 1], [1, 0], [0, -1]] #up, right, down, left = (clockwise from up) 
action_count = len(actions) 
gridSize = 6 
state_count = gridSize*gridSize

In [3]:
class Invader_Defender():
    def __init__(self, gridSize):
        self.valueMap = np.zeros((gridSize, gridSize))
        self.states = [[i, j] for i in range(gridSize) for j in range(gridSize)]
        self.size = gridSize
        
        # deterministic transition ?
        self.transition_prob = 1 
        
        # initialize defender and invader states
        self.new_state = [0, 0, 0, 0]
        self.new_defender_state = [0, 0]
        self.new_invader_state = [0, 0]
        
        # set territory state
        self.territory_state = [4, 4]

        # create a list of all possible states in the game
        self.game_state_list = []
        for defender_state in self.states:
            for invader_state in self.states:
                combined_states = tuple(defender_state + invader_state)
                self.game_state_list.append(combined_states)
        
        # create 2 lists of states representing defender and invader victory
        self.defender_won = []
        self.invader_won = []
        
        # create states representing defender victory
        for defender_state in self.states:
            for invader_state in self.states:
                distance = np.linalg.norm(np.array(defender_state) - np.array(invader_state))
                # if the invader is not at territory and within the capture range of defender = defender won
                if invader_state != self.territory_state and distance <= np.sqrt(2):
                    combined_states = defender_state + invader_state
                    self.defender_won.append(combined_states)
           
        # create states representing invader victory
        for defender_state in self.states:
            distance = np.linalg.norm(np.array(defender_state) - np.array(self.territory_state))
            # if the invader is at territory, and outside of the defender's capture range = invader won
            if distance > np.sqrt(2):
                combined_states = defender_state + self.territory_state
                self.invader_won.append(combined_states)
    
    def possible_states(self):
        """
        A function that returns a list of all possible states in the game
        """
        return self.game_state_list
    
    def terminal_check(self, state):
        """
        A function that checks whether the game is at a terminal state.
        Terminal state happens when either the invader or defender has won.
        """
        if state in self.defender_won:
            status = "Defender Won"
            terminal_check = True
        elif state in self.invader_won:
            status = "Invader Won"
            terminal_check = True
        else:
            terminal_check = False
            status = "Game in Progress"

        return terminal_check, status
    
#     def transition_probability(self, transition):
#         """
#         A function that returns the transition probability...?
#         """
#         return self.transition_prob, reward

    def next_state(self, current_state, defender_action, invader_action):
        """
        A function that returns the next state
        Input: current state [0,0] , defender_action [0, 1], invader_action [0,-1]
        Output: next state array([x1,y1,x2,y2]) and reward (int)
            - If the action takes the agent off grid, the agent remains in original state
            - If defender won, reward is calculated based on manhattan distance between invader captured state
            and territory
            - If defender loss, reward is -100
        """
        defender_state = []
        invader_state = []
        
        # deconstruct current state [0,0,1,1] in to defender [0,0] and invader [1,1] state
        for i in range(4):
            if i < 2:
                defender_state.append(current_state[i])
            else:
                invader_state.append(current_state[i])
                
        # get next state: state: [0, 0], action: [0, 1], new_state = [0, 1]
        self.new_defender_state = list(np.array(defender_state) + np.array(defender_action))
        self.new_invader_state = list(np.array(invader_state) + np.array(invader_action))

        # if new defender states results in off the grid, return to original state
        if -1 in self.new_defender_state or self.size in self.new_defender_state:
            self.new_defender_state = defender_state
        
        # if new invader states results in off the grid, return to original state
        if -1 in self.new_invader_state or self.size in self.new_invader_state:
            self.new_invader_state = invader_state
       
        # combine the defender and invader state
        self.new_state = self.new_defender_state
        self.new_state.extend(self.new_invader_state)
        
#         # original rewards
#         terminal, status = self.terminal_check(self.new_state)
#         if terminal == True:
#             if status == "Defender Won":
#                 # defender reward if defender won (manhattan distance between invader captured state and territory)
#                 distance_to_territory = sum(abs(np.array(self.new_invader_state) - np.array(self.territory_state)))
#                 self.reward = distance_to_territory
#             else:
#                 # defender reward if invader won
#                 self.reward = -100
#         else:
#             self.reward = 0
            
        # new rewards: penalizing defender for every step that invader takes closer to territory
        terminal, status = self.terminal_check(self.new_state)
        if terminal == True:
            if status == "Defender Won":
                # defender reward if defender won (manhattan distance between invader captured state and territory)
                distance_to_territory = sum(abs(np.array(self.new_invader_state) - np.array(self.territory_state)))
                self.reward = distance_to_territory
            else:
                # defender reward if invader won
                self.reward = -100
        else:
            # penalize defender for every step that invader takes closer to territory
            distance_to_territory = sum(abs(np.array(self.new_invader_state) - np.array(self.territory_state)))
            self.reward = -(8 - distance_to_territory)
            
            
        return self.new_state, self.reward

## Testing 

In [4]:
invader_defender = Invader_Defender(6)

In [5]:
next_state, reward = invader_defender.next_state([2,1,0,0], [-1, 0], [-1, 0])

In [6]:
next_state

[1, 1, 0, 0]

In [7]:
reward

8

In [8]:
invader_defender.terminal_check([1, 1, 0, 0])

(True, 'Defender Won')

## Initialization and Function Definitions

In [9]:
invader_defender = Invader_Defender(6)

In [10]:
state_list = []
delta_list = []

In [11]:
def calculate_value(G_state):
    """
    A function that calculates the value of a game by using linear programming.
    The value is calculated in both the defender and invader's perspective which are equal in value
    and opposite in signs
    Input: payoff matrix of a particular state (4x4 matrix)
    Output: Value = scalar value of the game.
    """
    
    G_state = list(G_state)
    
    # defender lin prog
    c = [0, 0, 0, 0, -1]
    defender_q = -1*np.transpose(G_state)     
    v_coeff = np.ones((4,1))
    Aub = np.concatenate((defender_q,v_coeff),1)
    b = [0, 0, 0, 0]
    Aeq = [[1, 1, 1, 1, 0]]
    beq = [[1.]]
    bounds = ((0,1),(0,1),(0,1),(0,1),(None, None))
    defender_solution = linprog(c, A_ub=Aub, b_ub=b, A_eq=Aeq, b_eq=beq, bounds=bounds, method='simplex')
    
    # invader lin prog
    c = [0, 0, 0, 0, 1]
    invader_q = G_state
    w_coeff = np.ones((4,1))*-1
    Aub = np.concatenate((invader_q,w_coeff),1)
    invader_solution = linprog(c, A_ub=Aub, b_ub=b, A_eq=Aeq, b_eq=beq, bounds=bounds, method='simplex')
    
#     defender_value = defender_solution['fun']*-1
#     invader_value = invader_solution['fun']*-1
        
    # check if the linprog solution is successful or not
    if defender_solution['status'] == 0:
        value = defender_solution['fun']*-1
    else:
        value = invader_solution['fun']
    
    return value

In [12]:
def calculate_payoff(state):
    """
    A function calculates the payoff of a specific state based on Q values
    Input: state (ie. [0,0,1,1])
    Output: payoff = 4x4 matrix where each element represent the defender's payoff 
    when defender take i, and invader take action j
    """
    state = list(state)
    payoff = np.zeros([4,4])
    for i in range(action_count):
        defender_action = i
        for j in range(action_count):
            invader_action = j
            joint_action = [defender_action, invader_action]
            state_action_pair = state + joint_action
            payoff[i, j] = Q[tuple(state_action_pair)]

    return payoff

In [13]:
def equilibrium(G_state):
    """
    A function that obtains the policy for defender and invader
    The value is calculated in both the defender and invader's perspective which are equal in value
    and opposite in signs
    Input: payoff matrix of a particular state (4x4 matrix)
    Output: policy for defender and invader
    """
    
    # defender lin prog
    c = [0, 0, 0, 0, -1]
    defender_q = -1*np.transpose(G_state)     
    v_coeff = np.ones((4,1))
    Aub = np.concatenate((defender_q,v_coeff),1)
    b = [0, 0, 0, 0]
    Aeq = [[1, 1, 1, 1, 0]]
    beq = [[1.]]
    bounds = ((0,1),(0,1),(0,1),(0,1),(None, None))
    defender_solution = linprog(c, A_ub=Aub, b_ub=b, A_eq=Aeq, b_eq=beq, bounds=bounds, method='simplex')
    
    # invader lin prog
    c = [0, 0, 0, 0, 1]
    invader_q = G_state
    w_coeff = np.ones((4,1))*-1
    Aub = np.concatenate((invader_q,w_coeff),1)
    invader_solution = linprog(c, A_ub=Aub, b_ub=b, A_eq=Aeq, b_eq=beq, bounds=bounds, method='simplex')
    
    defender_policy = defender_solution['x'][:4]
    invader_policy = invader_solution['x'][:4]
    
    return defender_policy, invader_policy

In [14]:
def choose_action(defender_policy, invader_policy, epsilon):
    """
    A function that choose a joint epsilon-greedy action based on defender/invader policy
    Input: defender_policy (1x4), invader policy (1x4), and epsilon (ie. 0.3)
    Output: joint action index = [defender action index, invader action index] = [0 to 3, 0 to 3]
    """
       
    # choose an action type: explore (0) or exploit(1)
    action_type = int(np.random.choice(2, 1, p=[epsilon,1-epsilon]))
    
    # pick the best action
    best_defender_action_index = np.argmax(defender_policy)
    best_invader_action_index = np.argmax(invader_policy)
    
    if action_type == 0:
        
        # randomly pick an action
        random_defender_action_index = random.choice(range(4))    
        random_invader_action_index = random.choice(range(4))    

        # while random action is the same as the best action, pick a new action
        while random_defender_action_index == best_defender_action_index:
            random_defender_action_index = random.choice(range(4))
        defender_action_index = random_defender_action_index
        
        # while random action is the same as the best action, pick a new action
        while random_invader_action_index == best_invader_action_index:
            random_invader_action_index = random.choice(range(4))
        invader_action_index = random_invader_action_index
    
    else:
        defender_action_index = best_defender_action_index
        invader_action_index = best_invader_action_index
    
    joint_action = [defender_action_index, invader_action_index]
    
    return joint_action
    

## Minimax Q

In [49]:
# initialize params
t = 0
T = 30000
lr = 0.9
gamma = 0.1
epsilon = 0.9

defender_policy = {}
invader_policy = {}

In [50]:
# initialize Q matrix
state_action_pair_list = []

# create every possible state action pairs: 
# 1296 states * 4 defender actions * 4 invader actions = 20736 s,a pairs
for state in invader_defender.game_state_list:
    for defender_action in range(action_count):
        for invader_action in range(action_count):
            joint_action = [defender_action, invader_action]
            state_action_pair = list(state) + joint_action
            state_action_pair_list.append(tuple(state_action_pair))

In [51]:
# initialize a dictionary for Q values = {(x1, y1, x2, y2, defender_action_index, invader_action_index): q_value}
listofzeros = [0.0] * len(state_action_pair_list)
Q = dict(zip(state_action_pair_list, listofzeros))

# initialize a dictionary for G values = {(x1, y1, x2, y2): payoff_matrix}
listofzeros = [0.0] * len(invader_defender.game_state_list)
for state in invader_defender.game_state_list:
    state_list.append(state)
G = dict(zip(state_list, listofzeros))

# initialize states
defender_state = [5,0]
invader_state = [0,0]
current_state = tuple(defender_state + invader_state)

In [52]:
# build game based on Q value
G[current_state] = calculate_payoff(current_state)

In [53]:
G[current_state]

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [54]:
# choose a policy by solving the current game
defender_policy[current_state], invader_policy[current_state] = equilibrium(G[current_state] )

In [55]:
trajectory = []

while t < T:
    
    # keep track of trajectory
    trajectory.append(current_state)
    
    # choose a joint based on epsilon greedy (joint_action = [a1_indx, a2_indx])
    joint_action = choose_action(defender_policy[current_state], invader_policy[current_state], epsilon)
    current_state_action_pair = tuple(list(current_state) + joint_action) # ie. (x1, y1, x2, y2, a1_indx, a2_indx)
    
    # get next state and reward based on current state [x1,y1,x2,y2] and joint action [a1_indx, a2_indx]
    next_state, reward = invader_defender.next_state(current_state, actions[joint_action[0]], actions[joint_action[1]])
    next_state = tuple(next_state)
    
    # build a game based on next state: calculate payoff of next state
    G[next_state] = calculate_payoff(next_state)
    
    # generate a policy based on equilibirum of next game
    defender_policy[next_state], invader_policy[next_state] = equilibrium(G[next_state])
    
    # calculate the value of the next game
    value = calculate_value(G[next_state])
    
    # update Q[s,a] <- Q[s,a] + lr*(reward + gamma*value(s') - Q[s,a])
    Q[current_state_action_pair] = Q[current_state_action_pair] + lr*(reward + gamma*value - Q[current_state_action_pair])
    
    # set next state as current state
    current_state = next_state
    t+=1
    
    # print k and current max delta
    clear_output(wait=True)
    display('t: ' + str(t))

't: 30000'

In [46]:
# animate the trajectory
game_trajectory = trajectory

# define game dimensions
columns=range(invader_defender.size)
index = range(invader_defender.size)

# animate the game
for step in range(len(game_trajectory)):
    game_table = pd.DataFrame(0, index = index, columns=columns)
    game_table[4][4] = 'Ter.'
    game_table[game_trajectory[step][1]][game_trajectory[step][0]] = 'DEF'
    game_table[game_trajectory[step][3]][game_trajectory[step][2]] = 'INV'
    clear_output(wait=True)
    display(game_table, step)
    time.sleep(0.1)

Unnamed: 0,0,1,2,3,4,5
0,0,0,0,0,0,DEF
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,Ter.,INV
5,0,0,0,0,0,0


33

KeyboardInterrupt: 

In [56]:
# display all Q entries with non zero values
unique_counter = 0

for key in Q:
    if Q[key] != 0.0:
        print(key, Q[key])
        unique_counter += 1

(4, 3, 4, 3, 2, 3) 1.8
(5, 4, 4, 1, 2, 1) -5.4
(4, 4, 4, 1, 1, 1) -6.1425
(3, 0, 3, 4, 1, 2) -90.571725
(0, 1, 4, 3, 1, 0) -5.64948
(3, 4, 1, 5, 2, 0) -2.7
(5, 2, 1, 3, 0, 3) -2.97
(4, 5, 5, 1, 2, 0) -5.076976105308342
(3, 0, 3, 5, 2, 0) -4.5
(5, 3, 3, 2, 3, 0) -4.148543369705044
(5, 0, 5, 5, 2, 1) -5.4
(5, 3, 4, 5, 0, 2) -5.959605225348499
(5, 3, 3, 5, 0, 0) -5.339476180221683
(3, 4, 2, 5, 0, 3) 1.998
(1, 3, 2, 5, 1, 1) 2.8619999999999997
(2, 5, 0, 5, 0, 1) 4.95
(0, 5, 2, 2, 0, 1) -5.310676598768611
(1, 2, 1, 5, 3, 1) -3.7710268393175315
(2, 4, 2, 5, 0, 0) 3.996
(3, 4, 0, 2, 3, 1) -3.048057000000009
(0, 3, 3, 5, 0, 1) -6.5091200155756574
(5, 3, 0, 5, 0, 2) -4.518776077496076
(5, 4, 2, 5, 3, 1) -5.480258245673171
(1, 2, 4, 5, 0, 1) -7.653424987968401
(3, 1, 1, 4, 1, 1) -4.196451310048483
(2, 1, 2, 1, 1, 0) 5.4
(2, 2, 4, 3, 0, 2) -6.276458477677933
(5, 2, 2, 1, 3, 0) -1.998
(4, 4, 1, 3, 1, 1) -5.422881012597831
(2, 0, 4, 1, 3, 3) -3.6
(0, 3, 5, 4, 1, 3) -6.326112439651208
(1, 4, 1, 2, 1

(2, 0, 4, 5, 2, 2) -6.287453836150845
(2, 3, 0, 5, 3, 1) -3.0239999999999996
(5, 3, 2, 0, 3, 2) -2.7
(5, 4, 3, 4, 1, 1) -6.310240385667353
(1, 3, 4, 3, 0, 3) -5.4
(0, 5, 3, 3, 3, 1) -7.5961734741105476
(4, 4, 2, 1, 3, 3) -1.8
(0, 1, 0, 5, 1, 2) -4.4860500000000005
(1, 3, 4, 3, 1, 2) -5.574626385028175
(2, 5, 1, 1, 0, 3) -0.9
(2, 4, 4, 3, 2, 2) -5.994
(1, 5, 1, 4, 1, 3) -4.294026
(3, 3, 4, 1, 3, 2) -3.6
(2, 4, 1, 3, 1, 2) -5.2288605112598505
(3, 4, 4, 2, 1, 3) -4.995
(1, 1, 5, 2, 2, 2) -4.95
(5, 4, 2, 0, 2, 2) -2.997
(3, 2, 2, 4, 0, 1) -4.5
(2, 3, 0, 4, 2, 2) -5.359027437043282
(0, 4, 1, 5, 0, 2) -5.5341
(5, 5, 0, 5, 3, 0) -3.4165782908729407
(2, 5, 0, 2, 1, 0) -1.99998
(4, 3, 4, 1, 2, 2) -3.96
(1, 5, 1, 1, 2, 3) -0.99
(1, 4, 5, 3, 0, 0) -7.564898350934876
(4, 1, 2, 3, 0, 2) -5.4
(1, 4, 0, 4, 2, 2) 3.204366359693169
(5, 1, 1, 4, 0, 3) -3.6
(1, 5, 1, 0, 1, 1) -1.998
(3, 0, 4, 4, 1, 3) -6.93
(5, 2, 1, 5, 1, 2) -5.6012479286098165
(1, 4, 0, 3, 3, 0) 5.206189655172413
(4, 2, 4, 4, 1, 3) 0.3

(4, 3, 2, 0, 0, 0) -0.9
(1, 3, 1, 0, 2, 2) -1.98
(0, 4, 1, 5, 2, 0) 4.7101524675935975
(5, 2, 1, 2, 2, 2) -4.1126042163257415
(1, 1, 0, 1, 2, 2) 5.4
(3, 2, 4, 4, 1, 0) 0.9899999999999998
(3, 3, 5, 5, 2, 2) -5.925969688644007
(1, 4, 5, 2, 1, 0) -6.343146081012801
(2, 2, 4, 2, 2, 3) 2.7
(1, 4, 2, 4, 2, 3) 2.97
(2, 5, 4, 2, 2, 2) -4.5
(1, 5, 1, 1, 0, 1) -2.97
(3, 3, 5, 4, 1, 0) -7.956648415422372
(0, 0, 2, 3, 2, 2) -5.94
(4, 3, 2, 2, 1, 2) -5.140668602665379
(2, 2, 4, 3, 1, 1) -99.0
(2, 3, 2, 3, 2, 0) -3.6
(0, 1, 2, 4, 1, 1) -5.355992802475386
(5, 2, 0, 5, 3, 2) -4.121990279611184
(1, 4, 4, 0, 1, 2) -2.7
(5, 3, 1, 1, 0, 1) -3.0509999999999997
(4, 3, 3, 5, 2, 1) -6.395228678944985
(1, 0, 3, 5, 2, 2) -6.93
(2, 5, 2, 1, 0, 2) -4.176100983630376
(4, 2, 1, 3, 3, 2) -5.0346
(3, 4, 4, 5, 0, 0) 1.9998
(5, 2, 3, 5, 2, 2) -6.80577536096006
(3, 1, 5, 5, 2, 0) -7.642071566402382
(3, 4, 3, 1, 1, 0) -3.105
(3, 1, 5, 5, 1, 2) -5.94
(1, 1, 5, 3, 2, 3) -4.5
(4, 4, 5, 4, 1, 1) 2.066338424728653
(0, 4, 2, 4

(1, 3, 3, 1, 2, 3) -3.0780000000000003
(2, 5, 2, 2, 0, 2) -5.25049356773821
(1, 5, 5, 2, 0, 1) -6.62931
(5, 5, 1, 2, 1, 1) -4.484543511184906
(4, 4, 4, 5, 1, 0) 1.8584623007189747
(0, 4, 2, 5, 1, 1) -5.622106010592214
(5, 4, 1, 5, 1, 1) -4.204198991449662
(3, 2, 4, 2, 2, 0) 2.97
(2, 4, 0, 2, 1, 1) -3.124741469976456
(2, 1, 3, 5, 3, 3) -7.458420566037735
(3, 2, 0, 3, 3, 3) -1.8
(4, 3, 5, 2, 1, 2) -5.224905037381931
(4, 3, 3, 0, 2, 0) -1.8
(5, 4, 4, 5, 2, 0) -6.290918737852998
(1, 1, 1, 3, 1, 1) -5.150991155558094
(5, 2, 5, 4, 0, 0) -100.3167
(0, 3, 0, 0, 0, 2) -0.9
(4, 4, 2, 4, 3, 1) -5.345600317953653
(2, 0, 3, 2, 1, 2) -5.4
(2, 3, 4, 0, 3, 3) -3.6
(0, 2, 2, 1, 1, 3) -1.8
(0, 2, 5, 1, 3, 3) -2.7
(2, 0, 1, 4, 3, 3) -3.6
(5, 0, 4, 2, 0, 1) -6.3
(2, 3, 3, 2, 0, 3) -3.96
(1, 5, 0, 1, 1, 2) -1.998
(2, 1, 2, 4, 1, 0) -5.368060436893833
(5, 2, 4, 2, 1, 0) -5.144399999999998
(4, 2, 1, 5, 3, 0) -3.37392
(1, 3, 2, 4, 0, 1) -4.619733924611974
(5, 3, 4, 5, 2, 3) -8.265491569798101
(4, 2, 1, 4, 0, 

(5, 0, 5, 3, 2, 1) -7.454159105818299
(1, 1, 4, 4, 2, 1) -7.209357576366082
(2, 0, 4, 4, 1, 1) -6.604410230365159
(3, 3, 0, 2, 1, 2) -2.7
(0, 2, 0, 5, 1, 2) -4.004999999999999
(4, 3, 0, 4, 2, 2) -5.245981457677532
(2, 5, 4, 5, 3, 0) 2.013670860995568
(4, 5, 5, 3, 1, 2) -5.994
(4, 2, 2, 3, 1, 1) -6.426734237589195
(5, 2, 3, 3, 0, 1) -6.3
(5, 4, 3, 5, 1, 2) 0.6536883135288111
(1, 5, 1, 3, 2, 1) 3.001442591036278
(5, 2, 5, 4, 3, 0) -90.0
(4, 1, 0, 1, 0, 2) -1.8
(4, 2, 4, 0, 0, 0) -2.7
(0, 5, 1, 5, 3, 0) 5.419412065560781
(5, 5, 1, 5, 3, 3) -5.539814072228064
(2, 2, 0, 4, 3, 3) -2.7
(4, 0, 1, 4, 2, 1) -3.996
(0, 4, 1, 0, 3, 3) -0.9
(3, 4, 4, 3, 2, 0) 1.8
(5, 0, 0, 5, 2, 3) -3.6
(1, 0, 2, 4, 2, 0) -5.212728872923215
(0, 4, 1, 3, 2, 1) 3.1173518278461265
(5, 4, 5, 5, 3, 2) -5.87568046397618
(5, 3, 3, 3, 2, 2) 0.709866155139617
(4, 3, 3, 5, 2, 0) -4.995
(2, 0, 5, 4, 1, 2) -6.3
(5, 2, 4, 1, 2, 1) 1.8
(3, 4, 0, 4, 2, 1) -2.9997
(2, 3, 2, 5, 2, 1) -4.9999995
(1, 5, 1, 2, 3, 0) -1.99998
(3, 2, 3,

(2, 1, 5, 4, 0, 2) -6.3
(3, 5, 5, 4, 3, 1) -5.971203179498514
(4, 1, 5, 1, 1, 0) 2.7
(1, 4, 4, 4, 2, 2) -6.993
(5, 3, 5, 3, 1, 3) -4.717157219100771
(3, 1, 1, 2, 1, 2) 3.96
(5, 1, 2, 1, 3, 1) -3.96
(4, 3, 1, 0, 2, 1) -1.86075
(0, 1, 5, 4, 3, 3) -6.152971962616823
(5, 4, 1, 5, 2, 2) -5.484599999999999
(0, 5, 1, 5, 1, 2) -5.633275279975421
(5, 5, 0, 1, 1, 1) -2.075512461989349
(0, 0, 0, 3, 2, 0) -2.997
(4, 2, 3, 5, 1, 2) -6.9993
(0, 4, 1, 0, 1, 1) -1.86075
(1, 1, 0, 4, 2, 0) -3.96
(0, 5, 5, 0, 1, 2) -2.7
(5, 5, 4, 4, 1, 3) -7.3436331546570095
(2, 4, 3, 3, 1, 1) 0.42011777827770536
(0, 4, 1, 3, 0, 3) -3.309368333397412
(5, 5, 4, 5, 2, 1) 0.999
(4, 4, 5, 1, 2, 2) -3.6
(2, 3, 5, 0, 0, 2) -2.7
(3, 2, 2, 0, 1, 1) -2.7
(1, 4, 5, 5, 0, 3) -7.591891707529489
(4, 2, 4, 5, 2, 1) -7.740274953357188
(4, 5, 1, 2, 3, 0) -2.1224946478498206
(1, 3, 3, 1, 2, 1) 3.111967741935484
(3, 5, 0, 0, 1, 2) -0.9
(4, 5, 1, 3, 0, 2) -5.203915533029626
(1, 2, 2, 1, 2, 3) -1.8
(0, 1, 5, 5, 2, 0) -7.58975508836843
(0, 

(1, 5, 4, 4, 2, 0) 0.42586675197422974
(4, 1, 4, 3, 1, 0) 1.98
(1, 5, 0, 3, 0, 0) -3.1323758001396262
(1, 3, 5, 4, 2, 1) -6.640072746521383
(1, 5, 5, 4, 2, 2) -7.56234764367986
(4, 3, 0, 3, 3, 1) -4.397664449818622
(5, 2, 0, 1, 1, 2) -2.04075
(5, 3, 4, 5, 1, 2) 2.08791
(5, 3, 0, 0, 1, 2) -0.9
(1, 4, 5, 2, 3, 3) -3.96
(2, 5, 0, 5, 3, 3) -3.999996
(5, 4, 2, 3, 0, 3) -4.136788337813053
(2, 5, 3, 0, 1, 1) -4.124893067836967
(3, 4, 0, 3, 3, 1) -3.96
(5, 4, 5, 2, 2, 1) 1.998
(0, 4, 5, 3, 3, 0) -6.999993
(3, 4, 0, 2, 0, 3) -0.99
(3, 1, 3, 4, 1, 2) -90.0
(5, 5, 2, 3, 0, 0) -4.492763024195734
(0, 4, 5, 2, 0, 2) -4.995
(0, 5, 4, 4, 0, 3) -7.649380273995246
(5, 5, 0, 2, 2, 2) -3.1334110043591514
(4, 3, 2, 4, 0, 2) 0.99
(3, 2, 0, 2, 1, 1) -2.7
(2, 0, 4, 0, 3, 3) -3.6
(3, 5, 2, 2, 1, 2) -5.240672573840651
(2, 0, 3, 5, 3, 3) -7.165922330097088
(2, 1, 0, 1, 1, 2) 5.4
(2, 0, 1, 5, 2, 1) -3.96
(4, 5, 1, 5, 0, 3) -5.1748199999999995
(1, 4, 3, 3, 1, 2) -7.504869666283085
(1, 5, 5, 1, 3, 3) -2.7
(0, 2, 4,

In [57]:
print(unique_counter)

11707


In [58]:
# policy extraction
for state in invader_defender.game_state_list:    
    G[state] = calculate_payoff(state)
    defender_policy[state], invader_policy[state] = equilibrium(G[state])

In [62]:
defender_policy

{(0, 5, 5, 2): array([0.40976459, 0.0469097 , 0.        , 0.5433257 ]),
 (3, 3, 5, 4): array([0.        , 0.        , 0.45205479, 0.54794521]),
 (5, 1, 0, 3): array([0.27673182, 0.        , 0.11445817, 0.60881001]),
 (5, 3, 5, 0): array([0., 0., 0., 1.]),
 (3, 1, 2, 0): array([0., 0., 0., 1.]),
 (0, 4, 1, 2): array([0., 1., 0., 0.]),
 (3, 3, 1, 5): array([0.07050485, 0.92949515, 0.        , 0.        ]),
 (5, 4, 5, 0): array([0.39451947, 0.        , 0.        , 0.60548053]),
 (4, 4, 4, 1): array([0.        , 0.        , 0.50636371, 0.49363629]),
 (0, 2, 2, 4): array([0.40750376, 0.        , 0.59249624, 0.        ]),
 (3, 1, 2, 1): array([1., 0., 0., 0.]),
 (4, 1, 0, 0): array([0., 0., 0., 1.]),
 (1, 4, 4, 2): array([0.        , 0.40735726, 0.        , 0.59264274]),
 (0, 4, 5, 1): array([0., 0., 0., 1.]),
 (5, 1, 4, 1): array([0., 0., 0., 1.]),
 (2, 5, 0, 0): array([1., 0., 0., 0.]),
 (3, 5, 0, 4): array([0.19073471, 0.        , 0.80926529, 0.        ]),
 (1, 1, 0, 5): array([0.79263551

### Save Results to Pickle 

In [27]:
# # takes ~ 50 minutes (132 iterations) to converge to within tolerance, 
# # so I am saving the learned U and G as a pickle
# # to load them up faster (for development purpose)

# with open('U.pickle', 'wb') as handle:
#     pickle.dump(U, handle)

# with open('G.pickle', 'wb') as handle:
#     pickle.dump(G, handle)

In [28]:
# # uncomment to load U and G

# with open ('U.pickle', 'rb') as handle:
#     U = pickle.load(handle)
    
# with open ('G.pickle', 'rb') as handle:
#     G = pickle.load(handle)

# # converged k. This gives the last update to U dict
# k = 132

In [29]:
# # initialize policies
# defender_policy = {}
# invader_policy = {}
# state_counter = 0

# # policy extraction
# for state in invader_defender.game_state_list:    
#     G[state] = calculate_payoff(state)
#     defender_policy[state], invader_policy[state] = equilibrium(G[state])
#     state_counter += 1
#     clear_output(wait=True)
#     display('State: ' + str(state_counter))

## Heatmap

In [30]:
# # create a list of states that fixes the defender's starting position
# fixed_defender_state_list = []
# for invader_state in invader_defender.states:
#     fixed_defender_state = [5, 0] + invader_state
#     fixed_defender_state_list.append(fixed_defender_state)

# # create invader heatmap
# invader_map = np.zeros([6,6])
# for state in fixed_defender_state_list:
#     invader_map[state[2], state[3]] = U[k][tuple(state)]*-1 # -1 for invaders perspective

In [31]:
# # if the defender is fixed at the bottom left corner, this heatmap shows the invader's rewards
# plt.imshow(invader_map, interpolation='nearest')
# plt.colorbar()
# plt.title('Value Function from the Invader Perspective (Defender fixed at [5,0])')
# plt.show()

In [32]:
# # create a list of states that fixes the invaders's starting position
# fixed_invader_state_list = []
# for defender_state in invader_defender.states:
#     fixed_invader_state = defender_state + [0, 0]
#     fixed_invader_state_list.append(fixed_invader_state)

# # create invader heatmap
# defender_map = np.zeros([6,6])
# for state in fixed_invader_state_list:
#     defender_map[state[0], state[1]] = U[k][tuple(state)]

In [33]:
# # if invader is fixed at top left corner, this heatmap shows the defender's rewards
# plt.imshow(defender_map, interpolation='nearest')
# plt.colorbar()
# plt.title('Value Function from the Defender Perspective (Invader fixed at [0,0])')
# plt.show()

## Plot Delta 

In [34]:
# plt.plot(delta_list)
# plt.title('Iteration vs Delta')
# plt.xlabel('Iteration')
# plt.xticks(np.arange(0, k, k/10))
# plt.ylabel('Delta')

## Play a Game

In [88]:
# def generate_trajectory(Defender_state, Invader_state):
#     game_trajectory = []
#     terminal = False
#     current_state = tuple(Defender_state + Invader_state)
#     game_step = 0
    
#     # generate a game trajectory
#     while not terminal:

#         # append game trajectory
#         game_trajectory.append(current_state)
               
#         # check if game is terminal (someone won)
#         terminal, status = invader_defender.terminal_check(list(current_state))
        
#         # both agents choose action based on policy via sampling
#         invader_action = actions[int(np.random.choice(action_count, 1, p=invader_policy[tuple(current_state)]))]
#         defender_action = actions[int(np.random.choice(action_count, 1, p=defender_policy[tuple(current_state)]))]
        
#         # obtain next state
#         next_state, reward = invader_defender.next_state(list(current_state), defender_action, invader_action)
#         current_state = tuple(next_state)
        
#         game_step += 1
#         clear_output(wait=True)
#         display("game step: " + str(game_step))
        
    
#     return game_trajectory, status

### Animate the Game 

In [94]:
# # generate game trajectory
# game_trajectory, status = generate_trajectory([0,0],[0,5]) 

In [93]:
# # define game dimensions
# columns=range(invader_defender.size)
# index = range(invader_defender.size)

# # animate the game
# for step in range(len(game_trajectory)):
#     game_table = pd.DataFrame(0, index = index, columns=columns)
#     game_table[4][4] = 'Ter.'
#     game_table[game_trajectory[step][1]][game_trajectory[step][0]] = 'DEF'
#     game_table[game_trajectory[step][3]][game_trajectory[step][2]] = 'INV'
#     clear_output(wait=True)
#     display(game_table)
#     time.sleep(0.1)
    
# # print game status
# display(status)

Unnamed: 0,0,1,2,3,4,5
0,DEF,0,0,0,0,0
1,0,INV,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,Ter.,0
5,0,0,0,0,0,0


'Defender Won'