In [50]:
import yaml
from os.path import isfile
from itertools import combinations
from scipy.stats import mannwhitneyu, binomtest

from lib.utils import * # Import custom utility functions

# Location of the YAML file holding the saved per-model evaluation metrics.
SAVE_METRICS_PATH = 'saved/evaluation_metrics.yml'

# Fail fast with a clear message: the statements below need ALL_MODEL_METRICS,
# and a missing file previously only surfaced as a NameError on MODEL_LIST.
if not isfile(SAVE_METRICS_PATH):
    raise FileNotFoundError(
        f"Metrics file not found: {SAVE_METRICS_PATH!r}. "
        "Generate the evaluation metrics before running this notebook."
    )

with open(SAVE_METRICS_PATH) as file:
    # NOTE(review): FullLoader can construct more than plain YAML types. If the
    # metrics file only holds plain mappings/lists/numbers, prefer yaml.safe_load.
    # Kept as-is because the file's contents are not visible from this notebook.
    ALL_MODEL_METRICS = yaml.load(file, Loader=yaml.FullLoader)

# An empty YAML document loads as None; anything that is not a mapping would
# otherwise fail on the key listing below with an opaque AttributeError.
if not isinstance(ALL_MODEL_METRICS, dict):
    raise ValueError(
        f"Expected a mapping of model name -> metrics in {SAVE_METRICS_PATH!r}, "
        f"got {type(ALL_MODEL_METRICS).__name__}."
    )

# Model names, in the order they appear in the metrics file.
MODEL_LIST = list(ALL_MODEL_METRICS)

### Statistical Test: Mann-Whitney U test
Purpose:
* To test whether the distributions of `step_array` obtained from two different RL model evaluations differ

Null hypothesis:
$H_0$: The two sample populations have the same distribution of scores.

References:
* https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html
* https://psych.unl.edu/psycrs/handcomp/hcmann.PDF

In [10]:
# Pairwise Mann-Whitney U tests: print the statistic, p-value and decision
# for every 2-combination of models in MODEL_LIST.

threshold = 0.05 # significance level; p-value >= threshold -> not enough evidence to reject H0

for model1, model2 in combinations(MODEL_LIST, 2):
    # Look up the `step_array` entry of each model in the pair.
    model1_step_array, model2_step_array = (
        ALL_MODEL_METRICS[name]['step_array'] for name in (model1, model2)
    )
    stat_test = mannwhitneyu(
        model1_step_array,
        model2_step_array,
        alternative='two-sided',
    )

    # H0 is rejected only when the p-value falls strictly below the threshold.
    reject_h0 = stat_test.pvalue < threshold

    print(f"""{model1} - {model2}:\n
      Test Statistic: {stat_test.statistic}\n  
      P-Value: {stat_test.pvalue}\n
      Reject Null Hypothesis: {reject_h0}
    """)


DQN_A - DQN_B:

      Test Statistic: 45148.5
  
      P-Value: 2.370185414934002e-24

      Reject Null Hypothesis: True
    
DQN_A - DQN_C:

      Test Statistic: 1928.5
  
      P-Value: 1.8810262352977872e-07

      Reject Null Hypothesis: True
    
DQN_A - DQN_D:

      Test Statistic: 13630.5
  
      P-Value: 3.7340609668768475e-10

      Reject Null Hypothesis: True
    
DQN_A - PPO_A:

      Test Statistic: 47110.5
  
      P-Value: 9.401454260818438e-20

      Reject Null Hypothesis: True
    
DQN_A - PPO_B:

      Test Statistic: 48024.0
  
      P-Value: 2.886009489749994e-23

      Reject Null Hypothesis: True
    
DQN_A - PPO_C:

      Test Statistic: 44679.5
  
      P-Value: 2.1926193316438223e-18

      Reject Null Hypothesis: True
    
DQN_A - PPO_D:

      Test Statistic: 46948.5
  
      P-Value: 1.1671578219189872e-18

      Reject Null Hypothesis: True
    
DQN_A - RPPO_A:

      Test Statistic: 624.0
  
      P-Value: 0.0003849042265522542

      Reject Null Hypo

### Statistical Test: Vargha and Delaney's $\hat{A}_{12}$ effect size statistic

Reference: https://gist.github.com/jacksonpradolima/f9b19d65b7f16603c837024d5f8c8a65

In [11]:
from lib.stats import *

In [38]:
# Compare the `all_reward_array` entries of PPO_D (treatment set) and
# DQN_D (control set) with VD_A and print what it returns.
rewards1, rewards2 = (
    ALL_MODEL_METRICS[name]['all_reward_array'] for name in ('PPO_D', 'DQN_D')
)

print(VD_A(rewards1, rewards2))

(0.7953875, 'large')


In [52]:
# Compare the `all_reward_array` entries of RPPO_D (treatment set) and
# DQN_D (control set) with VD_A and print what it returns.
rewards1, rewards2 = (
    ALL_MODEL_METRICS[name]['all_reward_array'] for name in ('RPPO_D', 'DQN_D')
)

print(VD_A(rewards1, rewards2))

(0.3127595, 'medium')
