In [1]:
%matplotlib inline
import os, sys, gc
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.tri as tri
from collections import Counter
from scipy.special import factorial
import itertools
from math import comb
from scipy import stats

In [2]:
version = '1.3'
load_dir = '../data/human/{}'.format(version)
save_dir = load_dir
print(os.listdir(load_dir))

['html-button-response_processed_1.3.csv', 'processed_data_set_level_1.3.p', 'html-button-response_processed.csv', 'participants.csv', 'processed_data_exp_level_1.3.p', 'html-button-response.csv', 'exclusion_data_1.3.csv', 'survey-multi-select.csv', 'html-keyboard-response.csv', 'demographics.numbers', 'processed_data_1.3.p', 'demographics.csv']


# Data analysis
In this script, we apply classical and Bayesian approaches to determine whether the data provide significant evidence of deviations from independence between features and relations. We will either focus on the strong or weak MAX effect, or MAX and MIN effects together (both of these represent deviations). 

The data are presented below. (D-B) - (C-A) positive represents weak support for MAX, and Max indicates whether strong support was also found. Strong support for a participant implies weak support. 

In [52]:
fullDataDF = pd.read_pickle('{}/processed_data_exp_level_{}.p'.format(save_dir, version)).to_frame()
fullDataDF.set_axis({'0': 'Aggregate Data'}, axis=1, inplace=False)

display(fullDataDF.round(2))

N = fullDataDF.loc["N"].iloc[0]
mu = fullDataDF.loc["(D-B)-(C-A)"].iloc[0]
sigma = fullDataDF.loc["SD_all"].iloc[0]
print('Mu is {}, sigma is {}, N is {}'.format(mu, sigma, N))
setDataDF = pd.read_pickle('{}/processed_data_set_level_{}.p'.format(save_dir, 
                                                                     version)).transpose().set_axis(['Set {}'.format(x) for x in np.arange(1, 7)], axis=1, inplace=False)

setDataDF['Mean'] = setDataDF.mean(numeric_only=True, axis=1)
display(setDataDF.round(2))

Unnamed: 0,0
A,5.01
B,3.3
C,4.54
D,3.35
(D-B)-(C-A),0.52
N_max,21.0
N_min,7.0
N,960.0
N_ind,932.0
Var_all,0.75


Mu is 0.5229166666666667, sigma is 0.8652708110117254, N is 960.0


Unnamed: 0,Set 1,Set 2,Set 3,Set 4,Set 5,Set 6,Mean
A,5.62,4.37,5.32,4.82,5.16,4.78,5.01
B,1.92,2.74,4.76,3.25,2.91,4.21,3.3
C,5.08,3.34,4.94,4.71,4.76,4.41,4.54
D,2.13,2.52,4.51,3.43,3.3,4.18,3.35
(D-B)-(C-A),0.76,0.81,0.14,0.29,0.79,0.35,0.52
N_max,1.0,5.0,3.0,2.0,7.0,3.0,3.5
N_min,0.0,1.0,1.0,3.0,0.0,2.0,1.17
N,160.0,160.0,160.0,160.0,160.0,160.0,160.0
N_ind,159.0,154.0,156.0,155.0,153.0,155.0,155.33
Var_set,2.84,3.75,3.21,2.58,3.95,4.03,3.39


In [54]:
def return_statistic(alpha, beta, option="z", N=30):
    """Currently assumes two-tailed"""
    if option == "z":
        return (stats.norm.ppf(1-alpha/2), stats.norm.ppf(beta))
    elif option == "t":
        return (stats.t.ppf(1-alpha/2, N-1), stats.t.ppf(beta, N-1))
    else:
        print("Statistic not implemented yet")
        return 
    
def return_p(statistic, option="z", N=30):
    if option == "z":
        return stats.norm.cdf(statistic)
    elif option == "t":
        return stats.t.cdf(statistic, N-1)
    else:
        print("Statistic not implemented yet")
        return 
    
def stat_p_val_power(mu_hat, sigma_hat, N=None, alpha=0.05, mu_0=0, option="z"):
    """Calculate P value and power of our sample"""  
    if option=="t":
        stat = (mu_hat - mu_0)/(sigma_hat/ np.sqrt(N-1))
        power = (mu_0 - mu_hat) / (sigma_hat / np.sqrt(N-1))
        p1 = return_p(power + stats.t.ppf(alpha/2, N-1), option=option, N=N) 
        p2 = 1 - return_p(power + stats.t.ppf(1-(alpha/2), N-1), option=option, N=N) 
    elif option=="z":
        stat = (mu_hat - mu_0)/(sigma_hat/ np.sqrt(N))
        power = (mu_0 - mu_hat) / (sigma_hat / np.sqrt(N))
        
        p1 = return_p(power + stats.norm.ppf(alpha/2), option=option, N=N) 
        p2 = 1 - return_p(power + stats.norm.ppf(1-(alpha/2)), option=option, N=N) 
    
    p_val = (1 - return_p(stat, option=option, N=N)) + return_p(-stat, option=option, N=N)
    power = p1 + p2
    return (stat, p_val, power)

t, p, _ = stat_p_val_power(mu, sigma, N=N, option="t")

print("All data significance is (t={}, p={}, dof={})".format(t, p, N-1))

All data significance is (t=18.714995808766133, p=3.6686348207750943e-67, dof=959.0)


In [82]:
results = []
for set_num, data in setDataDF.iteritems():
    print(data)
    N_t = data.loc["N"]
    mu_t = data.loc["(D-B)-(C-A)"]
    sigma_t = data.loc["SD_set"]
    t_t, p_t, _ = stat_p_val_power(mu_t, sigma_t, N=N_t, option="t", alpha=0.05/6) # post-hoc
    results.append([t_t, p_t, N_t-1])
    print("All data significance is (t={}, p={}, dof={})".format(t_t, p_t, N_t-1))
results.pop()

sigRes = pd.DataFrame(results)
sigRes.set_axis(['t', 'p', 'DOF'], axis=1, inplace=True)
sigRes.set_axis(['Set {}'.format(x) for x in np.arange(1, 7)], axis=0, inplace=True)
display(sigRes.round(5))

A                5.618750
B                1.921875
C                5.075000
D                2.134375
(D-B)-(C-A)      0.756250
N_max            1.000000
N_min            0.000000
N              160.000000
N_ind          159.000000
Var_set          2.837461
SD_set           1.684476
SEM_set          0.133170
Name: Set 1, dtype: float64
All data significance is (t=5.661076244684501, p=6.855203473522297e-08, dof=159.0)
A                4.365625
B                2.737500
C                3.337500
D                2.521875
(D-B)-(C-A)      0.812500
N_max            5.000000
N_min            1.000000
N              160.000000
N_ind          154.000000
Var_set          3.752344
SD_set           1.937097
SEM_set          0.153141
Name: Set 2, dtype: float64
All data significance is (t=5.288964150604872, p=4.013448846514589e-07, dof=159.0)
A                5.325000
B                4.762500
C                4.937500
D                4.512500
(D-B)-(C-A)      0.137500
N_max            3.00000

Unnamed: 0,t,p,DOF
Set 1,5.66108,0.0,159.0
Set 2,5.28896,0.0,159.0
Set 3,0.96736,0.33483,159.0
Set 4,2.2803,0.02392,159.0
Set 5,5.0372,0.0,159.0
Set 6,2.17847,0.03084,159.0
