# Replicating Calvano et al. (2020)
## Baseline - Optimality
### Author: Andréa Epivent

In [1]:
# Set working directory
import os
path = os.getcwd()

# Import packages
# packages.py is executed in the notebook namespace (presumably it provides
# the shared imports such as `np` -- confirm). The context manager closes the
# file handle as soon as the source has been read.
with open(path+"/packages.py") as _script_file:
    exec(_script_file.read())

# Import custom functions
from functions import *

# Import parameters
# parameters.py is executed the same way so that the names it defines become
# notebook globals.
with open(path+"/parameters.py") as _script_file:
    exec(_script_file.read())

In [2]:
# Load the arrays saved by the baseline training run.
# The files are read in the order listed and unpacked into the notebook
# globals used by the cells below.
q_table_1, q_table_2, A, S, conv_info, price1, price2 = (
    np.load(path + relative_file)
    for relative_file in (
        '/Output/Baseline/q_table_a1.npy',
        '/Output/Baseline/q_table_a2.npy',
        '/Output/Baseline/actions.npy',
        '/Output/Baseline/states.npy',
        '/Output/Baseline/conv_info.npy',
        '/Output/Baseline/price1.npy',
        '/Output/Baseline/price2.npy',
    )
)

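### Computing the optimality measure for both agents

For each episode, the measure is the ratio between the Q-value of the agent's greedy action in its last state (taken from its learned Q-matrix) and the highest Q-value obtained in that same state when the agent re-learns against its rival's fixed limit strategy.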

* Agent 1

In [5]:
# Optimality measure for agent 1.
# For every episode, a fresh Q-matrix is trained for agent 1 in its last state
# while agent 2 keeps playing its limit strategy; the learned Q-value of agent
# 1's greedy action is then divided by the best re-trained Q-value.
optim_a1 = []

for j in range(n_episodes):

    # Q-matrices of episode j: blocks of 225 rows, starting at row (j+1)*225,
    # so the first 225 rows of the stacked arrays are never read.
    # NOTE(review): presumably 225 is the number of states and the first block
    # holds an initial matrix -- confirm against the training script.
    q1 = q_table_1[(j+1)*225:(j+1)*225+225,:]
    q2 = q_table_2[(j+1)*225:(j+1)*225+225,:]

    # Initialize Q-matrix of agent
    q_table = init_Q(A)

    # Find last state and optimal action response according to limit strategy
    s_optim = find_rowindex(S,price1[j],price2[j])
    a_optim = np.argmax(q1[s_optim])

    # Q-matrix of agent 2 doesn't change, so its price in s_optim is the same
    # for every action and every iteration: compute it once.
    p2 = A[np.argmax(q2[s_optim])]

    # Loop over every action available in s_optim until convergence
    for act in A:

        # Everything below depends only on (act, p2), not on the Q-update, so
        # it is computed once per action instead of once per iteration.
        # Assumes find_rowindex and profit_compute are deterministic -- confirm.
        p1 = act
        next_state = find_rowindex(S,p1,p2) # row index associated with these two prices
        action = np.where(A == p1)[0][0] # get index associated to p1

        # Rewards
        reward = profit_compute(p1,p2)

        # Initialize convergence criteria
        count_convergence = 0
        convergence = False

        while not convergence:

            # Updating Q-table - for agent 1 only.
            # The updated cell always belongs to s_optim: the state is never
            # advanced to next_state.
            old_value = q_table[s_optim, action]
            next_max = np.max(q_table[next_state])

            new_value = (1 - alpha) * old_value + alpha * (reward + delta * next_max)
            q_table[s_optim, action] = new_value

            diff = abs(old_value-new_value)

            if diff < 1e-4:
                count_convergence += 1
            else:
                count_convergence = 0

            if count_convergence == 100: # doesn't change for at least 100 iterations
                convergence = True

    optim_a1.append(q1[s_optim,a_optim]/np.max(q_table[s_optim])) # compare limit strategy and theoretical Q-matrix

* Agent 2

In [6]:
# Optimality measure for agent 2.
# For every episode, a fresh Q-matrix is trained for agent 2 in its last state
# while agent 1 keeps playing its limit strategy; the learned Q-value of agent
# 2's greedy action is then divided by the best re-trained Q-value.
optim_a2 = []

for j in range(n_episodes):

    # Q-matrices of episode j: blocks of 225 rows, starting at row (j+1)*225,
    # so the first 225 rows of the stacked arrays are never read.
    # NOTE(review): presumably 225 is the number of states and the first block
    # holds an initial matrix -- confirm against the training script.
    q1 = q_table_1[(j+1)*225:(j+1)*225+225,:]
    q2 = q_table_2[(j+1)*225:(j+1)*225+225,:]

    # Initialize Q-matrix of agent
    q_table = init_Q(A)

    # Find last state and optimal action response according to limit strategy
    s_optim = find_rowindex(S,price1[j],price2[j])
    a_optim = np.argmax(q2[s_optim])

    # Q-matrix of agent 1 doesn't change, so its price in s_optim is the same
    # for every action and every iteration: compute it once.
    p1 = A[np.argmax(q1[s_optim])]

    # Loop over every action available in s_optim until convergence
    for act in A:

        # Everything below depends only on (p1, act), not on the Q-update, so
        # it is computed once per action instead of once per iteration.
        # Assumes find_rowindex and profit_compute are deterministic -- confirm.
        p2 = act
        next_state = find_rowindex(S,p1,p2) # row index associated with these two prices
        action = np.where(A == p2)[0][0] # get index associated to p2

        # Rewards (own price first, rival price second)
        reward = profit_compute(p2,p1)

        # Initialize convergence criteria
        count_convergence = 0
        convergence = False

        while not convergence:

            # Updating Q-table - for agent 2 only.
            # The updated cell always belongs to s_optim: the state is never
            # advanced to next_state.
            old_value = q_table[s_optim, action]
            next_max = np.max(q_table[next_state])

            new_value = (1 - alpha) * old_value + alpha * (reward + delta * next_max)
            q_table[s_optim, action] = new_value

            diff = abs(old_value-new_value)

            if diff < 1e-4:
                count_convergence += 1
            else:
                count_convergence = 0

            if count_convergence == 100: # doesn't change for at least 100 iterations
                convergence = True

    optim_a2.append(q2[s_optim,a_optim]/np.max(q_table[s_optim])) # compare limit strategy and theoretical Q-matrix

### Descriptive statistics

In [7]:
# Average optimality ratio of agent 1 across episodes
np.asarray(optim_a1).mean()

0.9889157639115969

In [8]:
# Episodes in which agent 1 plays a (near-)best response: indices of the
# episodes whose optimality ratio lies within 0.001, 0.01 and 0.05 of 1.
a1_t1, a1_t2, a1_t3 = (
    np.where(np.abs(np.array(optim_a1) - 1) < tolerance)[0]
    for tolerance in (0.001, 0.01, 0.05)
)

In [9]:
# Average optimality ratio of agent 2 across episodes
np.asarray(optim_a2).mean()

0.9865311879646961

In [10]:
# Episodes in which agent 2 plays a (near-)best response: indices of the
# episodes whose optimality ratio lies within 0.001, 0.01 and 0.05 of 1.
a2_t1, a2_t2, a2_t3 = (
    np.where(np.abs(np.array(optim_a2) - 1) < tolerance)[0]
    for tolerance in (0.001, 0.01, 0.05)
)

In [11]:
# Episodes in which both agents are within tolerance at the same time:
# intersection of the two agents' episode indices, one per tolerance level.
a1_a2_t1, a1_a2_t2, a1_a2_t3 = (
    np.intersect1d(idx_agent1, idx_agent2)
    for idx_agent1, idx_agent2 in (
        (a1_t1, a2_t1),
        (a1_t2, a2_t2),
        (a1_t3, a2_t3),
    )
)

In [12]:
# Descriptive table: one row for agent 1, one for agent 2, one for both.
# Each row shows the mean optimality ratio followed by the shapes of the
# index arrays (number of episodes) for the 0.001, 0.01 and 0.05 tolerances.
# The last row pools the ratios of both agents and uses the joint indices.
for row_ratios, row_t1, row_t2, row_t3 in (
    (optim_a1, a1_t1, a1_t2, a1_t3),
    (optim_a2, a2_t1, a2_t2, a2_t3),
    (np.concatenate((optim_a1,optim_a2)), a1_a2_t1, a1_a2_t2, a1_a2_t3),
):
    print(np.mean(row_ratios),row_t1.shape,row_t2.shape,row_t3.shape)

0.9889157639115969 (4,) (26,) (76,)
0.9865311879646961 (3,) (24,) (77,)
0.9877234759381466 (0,) (6,) (61,)
