In [1]:
%matplotlib inline
import os, sys, gc
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.tri as tri
from collections import Counter
from scipy.special import factorial
import itertools
from math import comb
from scipy import stats

In [2]:
version = '1.3'
load_dir = '../data/human/{}'.format(version)
save_dir = load_dir
print(os.listdir(load_dir))

['html-button-response_processed_1.3.csv', 'processed_data_set_level_1.3.p', 'html-button-response_processed.csv', 'participants.csv', 'processed_data_exp_level_1.3.p', 'html-button-response.csv', 'exclusion_data_1.3.csv', 'survey-multi-select.csv', 'html-keyboard-response.csv', 'processed_data_1.3.p', 'demographics.csv']


# Data analysis
In this script, we apply classical and Bayesian approaches to determine whether the data provide significant evidence of deviations from independence between features and relations. We will either focus on the strong or weak MAX effect, or MAX and MIN effects together (both of these represent deviations). 

The data are presented below. (D-B) - (C-A) positive represents weak support for MAX, and Max indicates whether strong support was also found. Strong support for a participant implies weak support. 

In [6]:
fullDataDF = pd.read_pickle('{}/processed_data_exp_level_{}.p'.format(save_dir, version))
display(fullDataDF)
N = fullDataDF["N"]
mu = fullDataDF["(D-B)-(C-A)"]
sigma = fullDataDF["SD_all"]

setDataDF = pd.read_pickle('{}/processed_data_set_level_{}.p'.format(save_dir, version))
display(setDataDF)

A                5.013542
B                3.298958
C                4.538021
D                3.346354
(D-B)-(C-A)      0.522917
N_max           21.000000
N_min            7.000000
N              960.000000
N_ind          932.000000
Var_all          0.748694
SD_all           0.865271
SEM_all          0.027926
dtype: float64

Unnamed: 0,A,B,C,D,(D-B)-(C-A),N_max,N_min,N,N_ind,Var_set,SD_set,SEM_set
0,5.61875,1.921875,5.075,2.134375,0.75625,1.0,0.0,160.0,159.0,2.837461,1.684476,0.13317
1,4.365625,2.7375,3.3375,2.521875,0.8125,5.0,1.0,160.0,154.0,3.752344,1.937097,0.153141
2,5.325,4.7625,4.9375,4.5125,0.1375,3.0,1.0,160.0,156.0,3.212344,1.792301,0.141694
3,4.825,3.253125,4.709375,3.428125,0.290625,2.0,3.0,160.0,155.0,2.582725,1.607086,0.127051
4,5.1625,2.909375,4.759375,3.3,0.79375,7.0,0.0,160.0,153.0,3.948086,1.986979,0.157084
5,4.784375,4.209375,4.409375,4.18125,0.346875,3.0,2.0,160.0,155.0,4.03124,2.007795,0.15873


In [4]:
def return_statistic(alpha, beta, option="z", N=30):
    """Currently assumes two-tailed"""
    if option == "z":
        return (stats.norm.ppf(1-alpha/2), stats.norm.ppf(beta))
    elif option == "t":
        return (stats.t.ppf(1-alpha/2, N-1), stats.t.ppf(beta, N-1))
    else:
        print("Statistic not implemented yet")
        return 
    
def return_p(statistic, option="z", N=30):
    if option == "z":
        return stats.norm.cdf(statistic)
    elif option == "t":
        return stats.t.cdf(statistic, N-1)
    else:
        print("Statistic not implemented yet")
        return 
    
def stat_p_val_power(mu_hat, sigma_hat, N=None, alpha=0.05, mu_0=0, option="z"):
    """Calculate P value and power of our sample"""  
    if option=="t":
        stat = (mu_hat - mu_0)/(sigma_hat/ np.sqrt(N-1))
        power = (mu_0 - mu_hat) / (sigma_hat / np.sqrt(N-1))
        p1 = return_p(power + stats.t.ppf(alpha/2, N-1), option=option, N=N) 
        p2 = 1 - return_p(power + stats.t.ppf(1-(alpha/2), N-1), option=option, N=N) 
    elif option=="z":
        stat = (mu_hat - mu_0)/(sigma_hat/ np.sqrt(N))
        power = (mu_0 - mu_hat) / (sigma_hat / np.sqrt(N))
        
        p1 = return_p(power + stats.norm.ppf(alpha/2), option=option, N=N) 
        p2 = 1 - return_p(power + stats.norm.ppf(1-(alpha/2)), option=option, N=N) 
    
    p_val = 1 - return_p(stat, option=option, N=N) + return_p(-stat, option=option, N=N)
    power = p1 + p2
    return (stat, p_val, power)

t, p, _ = stat_p_val_power(mu, sigma, N=N, option="t")
print("All data significance is (t={}, p={}, dof={})".format(t, p, N-1))

All data significance is (t=18.714995808766133, p=3.6686348207750943e-67, dof=959.0)


In [16]:
for set_num, data in setDataDF.iterrows():
    print(set_num)
    N_t = data["N"]
    mu_t = data["(D-B)-(C-A)"]
    sigma_t = data["SD_set"]
    t_t, p_t, _ = stat_p_val_power(mu_t, sigma_t, N=N_t, option="t", alpha=0.05/6) # post-hoc
    print("All data significance is (t={}, p={}, dof={})".format(t_t, p_t, N_t-1))

0
All data significance is (t=5.661076244684501, p=6.855203473522297e-08, dof=159.0)
1
All data significance is (t=5.288964150604872, p=4.013448846514589e-07, dof=159.0)
2
All data significance is (t=0.9673647412099098, p=0.33483085416802577, dof=159.0)
3
All data significance is (t=2.2803025979165668, p=0.02391834413651879, dof=159.0)
4
All data significance is (t=5.037197762692263, p=1.2698743154908941e-06, dof=159.0)
5
All data significance is (t=2.178473205734793, p=0.030843472311027735, dof=159.0)
