In [16]:
import pandas as pd
import os, pathlib
from scipy.stats import kendalltau
import numpy as np

### SciPy definition of Kendall's Tau: 
### $$\frac{P - Q}{\sqrt{(P + Q + T) * (P + Q + U)}}$$
#### where P is the number of concordant pairs, Q the number of discordant pairs, T the number of ties only in x, and U the number of ties only in y. If a tie occurs for the same pair in both x and y, it is not added to either T or U. (Note that ties will have been broken randomly by the calc_rank function below.) 
#### See https://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.stats.kendalltau.html for more information.
#### Should we use a non-negative definition of KT instead?

In [17]:
def calc_rank(seed, y):

    '''
    Rank the elements of y in descending order (largest value gets rank 1),
    breaking ties randomly using the np random seed.

    Function adapted from https://www.geeksforgeeks.org/rank-elements-array/
    Copied from utils.stabililty_utils.py

    Parameters
    ----------
    seed : value accepted by np.random.seed
        Seed for NumPy's global random state. It is set on every call, so
        the same (seed, y) pair always yields the same ranking.
    y : sized iterable of mutually comparable values
        Scores to rank (list, NumPy array, pandas Series, ...). Values are
        read in iteration order, so a pandas Series is handled by position
        and its index labels are ignored.

    Returns
    -------
    list of int
        R[k] is the 1-based rank of the k-th element of y. Elements with
        equal value share a contiguous run of ranks, assigned in a random
        order.
    '''

    # Set random seed (global NumPy state, exactly as the callers expect)
    np.random.seed(seed)

    # Materialise the values by position. Indexing y[i] directly would be a
    # label lookup on a pandas Series and fail (or pick the wrong element)
    # whenever the index is not 0..n-1.
    values = list(y)
    n_items = len(values)

    # Positions of the elements ordered from largest to smallest value.
    # The sort is stable, so tied elements keep their original relative order.
    order = sorted(range(n_items), key=lambda k: values[k], reverse=True)

    # Initialize rank vector
    R = [0] * n_items

    start = 0
    while start < n_items:

        # Extend [start, stop) over the run of equal adjacent values
        stop = start + 1
        while stop < n_items and values[order[stop - 1]] == values[order[stop]]:
            stop += 1

        # Ranks available to this run
        ranks = list(range(start + 1, stop + 1))

        # Only tied runs consume randomness; a run of one keeps its rank
        if len(ranks) > 1:
            np.random.shuffle(ranks)

        # Assign rank to each element of the run
        for idx, rank in zip(order[start:stop], ranks):
            R[idx] = rank

        # Move on to the next run
        start = stop

    # return rank vector
    return R

In [35]:
# Input locations, resolved relative to the directory two levels above the
# current working directory
out_dir = pathlib.Path.cwd().parents[1] / 'out'
cf_data_dir = out_dir / 'counterfactual_data' / 'stability' / 'default'
noise_rank_dir = out_dir / 'synthetic_data' / 'stability' / 'default' / 'rankings'

s_samples = 200
n_runs = 500

# Highest seed not yet used in data gen
seed = s_samples * (3 * (n_runs + 1) + 2)

# Kendall's tau and its p-value: one row per sample, one column per noise run
noise_kts = np.zeros((s_samples, n_runs))
noise_ps = np.zeros((s_samples, n_runs))

for s in range(1, s_samples + 1):

    # Load the counterfactual data for this sample
    counter = pd.read_csv(cf_data_dir / 'counter_samp_{}.csv'.format(s))

    # Columns holding counterfactual Y values
    counter_y_cols = [col for col in counter.columns if 'cf_y_' in col]

    # Rank every counterfactual Y, using a fresh seed for each column so
    # that tie-breaking differs between columns
    for y in counter_y_cols:
        counter['rank_' + y[5:]] = calc_rank(seed=seed, y=counter[y])
        seed += 1

    # Load the noise distribution rankings for this sample
    noise = pd.read_csv(noise_rank_dir / 'observed_samp_{}.csv'.format(s))

    # Ranking that every noise run is compared against
    orig_rank = noise['rank']

    # KT between the original rank and each rank from the noise distribution;
    # averaging a row will give the expected noise KT for this sample
    for n in range(1, n_runs + 1):
        kt, p = kendalltau(orig_rank, noise['rank_{}'.format(n)])
        noise_kts[s - 1, n - 1] = kt
        noise_ps[s - 1, n - 1] = p

    # TODO: Get KT distances between original rank and counterfactual Y with non-resolving X
    # Average of A=0 intervention and A=1 intervention?

    # TODO: Get KT distances between original rank and counterfactual Y with resolving X
    # Average of A=0 intervention and A=1 intervention?

# TODO: Get expected KT distance between original rank and counterfactual Y with non-resolving X

# TODO: Get expected KT distance between original rank and counterfactual Y with resolving X

# TODO: Get expected KT distance between original rank and rank from re-sampled noise
# Expectation taken over samples