In [14]:
import yaml
from os.path import isfile
from itertools import combinations
from scipy.stats import mannwhitneyu, binomtest, tmean, tstd

from lib.utils import * # Import custom utility functions

SAVE_METRICS_PATH = 'saved/evaluation_metrics.yml'

# Load the saved evaluation metrics: a mapping keyed by model name.
if isfile(SAVE_METRICS_PATH):
    with open(SAVE_METRICS_PATH) as file:
        # NOTE(review): FullLoader can construct more than plain YAML types. If the
        # metrics file only holds dicts/lists/numbers, yaml.safe_load would be enough.
        ALL_MODEL_METRICS = yaml.load(file, Loader=yaml.FullLoader)
elif 'ALL_MODEL_METRICS' not in globals():
    # No saved file and no metrics already in scope (e.g. provided by `lib.utils`):
    # fail here with an actionable message instead of a bare NameError below.
    raise FileNotFoundError(
        f"Evaluation metrics file not found: {SAVE_METRICS_PATH!r}. "
        "Generate it before running the statistical tests."
    )

# Model names, in the order they appear in the metrics mapping.
MODEL_LIST = list(ALL_MODEL_METRICS.keys())

### Statistical Test: Mann-Whitney U test
Purpose:
* To test whether the distributions of `step_array` obtained from two different RL model evaluations differ

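Null hypothesis,
$H_0:$ The two sample populations have the same distribution of scores.

Alternative hypothesis (one-sided, because the test below is run with `alternative='less'`),
$H_1:$ The `step_array` distribution of the second model in each pair (treatment) is stochastically less than that of the first model (control).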

References:
* https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html
* https://psych.unl.edu/psycrs/handcomp/hcmann.PDF

In [18]:
# Pairwise Mann-Whitney U tests: statistic, p-value and decision for every model pair

threshold = 0.05 # significance level; a p-value at or above it means H0 is not rejected

# Each unordered pair is visited once; the first model acts as control,
# the second as treatment.
for control_name, treatment_name in combinations(MODEL_LIST, 2):
    control_steps = ALL_MODEL_METRICS[control_name]['step_array']
    treatment_steps = ALL_MODEL_METRICS[treatment_name]['step_array']

    # One-sided test: the treatment sample is passed first, so alternative='less'
    # asks whether treatment is stochastically smaller than control.
    stat_test = mannwhitneyu(treatment_steps, control_steps, alternative='less')
    reject_h0 = stat_test.pvalue < threshold

    report = f"""{control_name} - {treatment_name}:\n
      Test Statistic: {stat_test.statistic}\n  
      P-Value: {stat_test.pvalue}\n
      Reject Null Hypothesis: {reject_h0}
    """
    print(report)


DQN_A - DQN_B:

      Test Statistic: 6196.5
  
      P-Value: 1.185092707467001e-24

      Reject Null Hypothesis: True
    
DQN_A - DQN_C:

      Test Statistic: 5379.5
  
      P-Value: 0.9999999074681678

      Reject Null Hypothesis: False
    
DQN_A - DQN_D:

      Test Statistic: 4513.5
  
      P-Value: 1.8670304834384237e-10

      Reject Null Hypothesis: True
    
DQN_A - PPO_A:

      Test Statistic: 8896.5
  
      P-Value: 4.700727130409219e-20

      Reject Null Hypothesis: True
    
DQN_A - PPO_B:

      Test Statistic: 7227.0
  
      P-Value: 1.443004744874997e-23

      Reject Null Hypothesis: True
    
DQN_A - PPO_C:

      Test Statistic: 9248.5
  
      P-Value: 1.0963096658219112e-18

      Reject Null Hypothesis: True
    
DQN_A - PPO_D:

      Test Statistic: 9562.5
  
      P-Value: 5.835789109594936e-19

      Reject Null Hypothesis: True
    
DQN_A - RPPO_A:

      Test Statistic: 132.0
  
      P-Value: 0.0001924521132761271

      Reject Null Hypothesis: Tr

### Statistical Test: Vargha and Delaney's (A12) statistic

Reference: https://gist.github.com/jacksonpradolima/f9b19d65b7f16603c837024d5f8c8a65

In [3]:
from lib.stats import *

In [12]:
# Compare the reward arrays of PPO_D (treatment) and DQN_D (control).
rewards1 = ALL_MODEL_METRICS['PPO_D']['all_reward_array'] # treatment set
rewards2 = ALL_MODEL_METRICS['DQN_D']['all_reward_array'] # control set
# `tstd` returns the sample standard deviation, hence the "Std Dev" label.
print(f"Reward1 - Mean: {tmean(rewards1):.3f}, Std Dev: {tstd(rewards1):.3f}")
print(f"Reward2 - Mean: {tmean(rewards2):.3f}, Std Dev: {tstd(rewards2):.3f}")
# Vargha-Delaney A12 effect size, printed as an (estimate, magnitude label) tuple.
print(VD_A(rewards1, rewards2))

Reward1 - Mean: 88.284, Std Var: 32.552
Reward2 - Mean: 23.749, Std Var: 48.228
(0.7953875, 'large')


In [13]:
# Compare the reward arrays of RPPO_D (treatment) and DQN_D (control).
rewards1 = ALL_MODEL_METRICS['RPPO_D']['all_reward_array'] # treatment set
rewards2 = ALL_MODEL_METRICS['DQN_D']['all_reward_array'] # control set
# `tstd` returns the sample standard deviation, hence the "Std Dev" label.
print(f"Reward1 - Mean: {tmean(rewards1):.3f}, Std Dev: {tstd(rewards1):.3f}")
print(f"Reward2 - Mean: {tmean(rewards2):.3f}, Std Dev: {tstd(rewards2):.3f}")
# Vargha-Delaney A12 effect size, printed as an (estimate, magnitude label) tuple.
print(VD_A(rewards1, rewards2))

Reward1 - Mean: 0.767, Std Var: 31.129
Reward2 - Mean: 23.749, Std Var: 48.228
(0.3127595, 'medium')


In [11]:
# Compare the reward arrays of A2C_D (treatment) and DQN_D (control).
rewards1 = ALL_MODEL_METRICS['A2C_D']['all_reward_array'] # treatment set
rewards2 = ALL_MODEL_METRICS['DQN_D']['all_reward_array'] # control set
# `tstd` returns the sample standard deviation, hence the "Std Dev" label.
print(f"Reward1 - Mean: {tmean(rewards1):.3f}, Std Dev: {tstd(rewards1):.3f}")
print(f"Reward2 - Mean: {tmean(rewards2):.3f}, Std Dev: {tstd(rewards2):.3f}")
# Vargha-Delaney A12 effect size, printed as an (estimate, magnitude label) tuple.
print(VD_A(rewards1, rewards2))

Reward1 - Mean: 62.600, Std Var: 51.246
Reward2 - Mean: 23.749, Std Var: 48.228
(0.6672425, 'medium')
