# Replicating Calvano et al. (2020)
## Baseline - Stat des - Convergence
### Author: Andréa Epivent

In [1]:
# Use the current working directory as the project root for every path below
import os
path = os.getcwd()

# Import packages
# packages.py is executed in this namespace, so the names it defines become
# available to the rest of the notebook. The file is opened in a `with`
# block so the handle is closed as soon as its contents have been read.
with open(path+"/packages.py") as _script:
    exec(_script.read())

# Import custom functions
from functions import *

# Import parameters
# Same mechanism as above: parameters.py is executed in this namespace.
with open(path+"/parameters.py") as _script:
    exec(_script.read())

In [2]:
# Load the arrays saved by the baseline training run, in the order:
# agent 1 Q-table, agent 2 Q-table, actions, states, convergence info.
q_table_1, q_table_2, A, S, conv_info = [
    np.load(path+relative_file)
    for relative_file in (
        '/Output/Baseline/q_table_a1.npy',
        '/Output/Baseline/q_table_a2.npy',
        '/Output/Baseline/actions.npy',
        '/Output/Baseline/states.npy',
        '/Output/Baseline/conv_info.npy',
    )
]

### Descriptive statistics

* Average number of iterations per episode

In [3]:
# Row 0 of conv_info is averaged over all of its entries
conv_info[0].mean()

1873408.56

* Number of episodes that did not converge

In [4]:
# Count the entries of row 0 of conv_info that are equal to criterion_final
(conv_info[0] == criterion_final).sum()

0

### Robustness

<b>Idea</b>: check that optimal actions don't change when we turn exploration off.

In [5]:
# Robustness check: starting from the last prices of each episode, both agents
# play greedily (always the argmax of their Q-matrix) while still applying the
# Q-learning update. An episode is recorded as True in `conv_episodes` when no
# agent changes its greedy action in an already-visited state for `threshold`
# periods, and as False otherwise. Exactly one outcome is stored per episode.

# Store the last price of both agents for each episode
price1, price2 = conv_info[1,:], conv_info[2,:]
threshold = 1e4
conv_episodes = []

for j in range(n_episodes):

    t = 0

    # Q-matrix of both agents for episode j.
    # NOTE(review): 225 rows per episode is hard-coded (presumably equal to
    # state_space) and the (j+1) offset skips the first 225-row block of the
    # stacked tables - confirm both against the training code.
    # NOTE: these slices are views, so the updates below are written back into
    # q_table_1 / q_table_2.
    first_row = (j+1)*225
    q1 = q_table_1[first_row:first_row+225,:]
    q2 = q_table_2[first_row:first_row+225,:]

    # Row index of the state formed by the last prices of the episode
    state = find_rowindex(S,price1[j],price2[j])

    # Loop flag: set to True as soon as the check for this episode is over
    convergence = False

    # Set to True if any agent changes its greedy action in a visited state
    action_changed = False

    # Greedy action last played in each state (-1 = state not visited yet)
    stab1 = np.full([state_space],-1)
    stab2 = np.full([state_space],-1)

    while not convergence:

        # Greedy action of each agent in the current state
        action_a1 = np.argmax(q1[state])
        action_a2 = np.argmax(q2[state])

        # Retrieve prices and the row index of the state they lead to
        p1,p2 = A[action_a1], A[action_a2]
        next_state = find_rowindex(S,p1,p2)

        # Rewards
        reward_a1 = profit_compute(p1,p2)
        reward_a2 = profit_compute(p2,p1)

        # Updating Q-table - for agent 1
        old_value_a1 = q1[state, action_a1]
        next_max_a1 = np.max(q1[next_state])

        new_value_a1 = (1 - alpha) * old_value_a1 + alpha * (reward_a1 + delta * next_max_a1)
        q1[state, action_a1] = new_value_a1

        # Updating Q-table - for agent 2
        old_value_a2 = q2[state, action_a2]
        next_max_a2 = np.max(q2[next_state])

        new_value_a2 = (1 - alpha) * old_value_a2 + alpha * (reward_a2 + delta * next_max_a2)
        q2[state, action_a2] = new_value_a2

        # Stop if the greedy action differs from the one previously recorded
        # for this state. Both agents are checked the same way, so a change by
        # either of them marks the episode as not robust.
        if stab1[state] != -1 and action_a1 != stab1[state]:
            action_changed = True
            print(f"Agent 1 changed optimal action at stage {state} and period {t}")

        if stab2[state] != -1 and action_a2 != stab2[state]:
            action_changed = True
            print(f"Agent 2 changed optimal action at stage {state} and period {t}")

        stab1[state] = action_a1
        stab2[state] = action_a2

        # Move on to the state induced by the prices just played
        state = next_state

        t += 1

        # Stop on an action change, or in any case after `threshold` periods
        if action_changed or t >= threshold:
            convergence = True

    # One outcome per episode: True only if no greedy action ever changed
    conv_episodes.append(not action_changed)

In [6]:
# conv_episodes only holds booleans, so summing it counts the True entries
sum(conv_episodes)

1000