In [9]:
import pandas as pd
import numpy as np
from statsmodels.stats.multitest import multipletests
from scipy.stats import wilcoxon,ranksums,mannwhitneyu
# load the source data for each panel 
panel_b = pd.read_excel('./fig1_sourcedata.xlsx',engine='openpyxl',sheet_name='Panel_b',index_col=0)
panel_c = pd.read_excel('./fig1_sourcedata.xlsx',engine='openpyxl',sheet_name='Panel_c',index_col=0)
panel_d = pd.read_excel('./fig1_sourcedata.xlsx',engine='openpyxl',sheet_name='Panel_d',index_col=0)
panel_e = pd.read_excel('./fig1_sourcedata.xlsx',engine='openpyxl',sheet_name='Panel_e',index_col=0)
panel_f = pd.read_excel('./fig1_sourcedata.xlsx',engine='openpyxl',sheet_name='Panel_f',index_col=0)
panel_g = pd.read_excel('./fig1_sourcedata.xlsx',engine='openpyxl',sheet_name='Panel_g',index_col=0)

In [14]:
np.random.seed(0)
def bootstrap(arr,itr=20000):
    boots = []
    for i in range(itr):
        boots.append(np.median(np.random.choice(arr,arr.shape,replace=True)))
        
    return(boots)

def print_bootstrap_results(df,metric,lang,itr=20000):
    to_use = df[(df.Metric == metric) & (df.lang == lang)]['WER']
    #to_use = df[(df.paradigm == metric) & (df.lang == lang)]['WER']
        
    straps = np.array(bootstrap(to_use))
    median_value = np.round(np.median(to_use),2)
    low_bound = np.round(np.percentile(straps,0.5),2)
    up_bound = np.round(np.percentile(straps,99.5),2)
    print('Results for: ',metric, ' ', lang)
    print('Median: ', median_value)
    print('99% CI: [', low_bound, ', ', up_bound, ']')
    print('')


    
    

In [11]:
panel_b.head()

Unnamed: 0,WER,lang,Metric
0,38.888889,english,Real-time
1,22.222222,english,Real-time
2,20.0,english,Real-time
3,6.666667,english,Real-time
4,0.0,english,Real-time


In [15]:
print_bootstrap_results(panel_b,'Real-time','overall')
print_bootstrap_results(panel_b,'Real-time','english')
print_bootstrap_results(panel_b,'Real-time','spanish')

Results for:  Real-time   overall
Median:  25.0
99% CI: [ 17.24 ,  36.36 ]

Results for:  Real-time   english
Median:  22.22
99% CI: [ 7.14 ,  43.75 ]

Results for:  Real-time   spanish
Median:  26.67
99% CI: [ 18.18 ,  33.33 ]



In [16]:
print_bootstrap_results(panel_b,'Neural-only','overall')
print_bootstrap_results(panel_b,'Neural-only','english')
print_bootstrap_results(panel_b,'Neural-only','spanish')

Results for:  Neural-only   overall
Median:  70.62
99% CI: [ 61.88 ,  78.12 ]

Results for:  Neural-only   english
Median:  55.0
99% CI: [ 46.25 ,  68.75 ]

Results for:  Neural-only   spanish
Median:  52.5
99% CI: [ 40.42 ,  61.67 ]



In [17]:
d = {'Language': [], 'Paradigm 1': [], 'Paradigm 2': [], 'Test-statistic': [], 'P-value': []} 
compares = [('Chance','Neural-only'),('Chance','Real-time'),('Neural-only','Real-time')]
langs = ['overall','spanish','english']
all_ps = []
for lang in langs:
    for compare in compares:
        d['Language'].append(lang.capitalize())
        d['Paradigm 1'].append(compare[0])
        d['Paradigm 2'].append(compare[1])
    
        result = mannwhitneyu(panel_b['WER'][(panel_b['lang'] == lang) & (panel_b['Metric'] == compare[0])],
                 panel_b['WER'][(panel_b['lang'] == lang) & (panel_b['Metric'] == compare[1])])
        d['Test-statistic'].append(result[0])
        d['P-value'].append(result[1])

d = pd.DataFrame(d)
d['P-value'] = multipletests(d['P-value'].values,alpha=0.01,method='holm')[1]

In [18]:
print(d.to_latex(index=False,float_format="{:0.2e}".format))  

\begin{tabular}{lllrr}
\toprule
Language &  Paradigm 1 &  Paradigm 2 &  Test-statistic &  P-value \\
\midrule
 Overall &      Chance & Neural-only &        4.41e+02 & 1.91e-07 \\
 Overall &      Chance &   Real-time &        4.41e+02 & 1.91e-07 \\
 Overall & Neural-only &   Real-time &        4.40e+02 & 1.91e-07 \\
 Spanish &      Chance & Neural-only &        4.41e+02 & 1.91e-07 \\
 Spanish &      Chance &   Real-time &        4.41e+02 & 1.91e-07 \\
 Spanish & Neural-only &   Real-time &        4.13e+02 & 2.66e-06 \\
 English &      Chance & Neural-only &        4.40e+02 & 1.91e-07 \\
 English &      Chance &   Real-time &        4.41e+02 & 1.91e-07 \\
 English & Neural-only &   Real-time &        3.74e+02 & 1.24e-04 \\
\bottomrule
\end{tabular}



  print(d.to_latex(index=False,float_format="{:0.2e}".format))


In [19]:
to_use = panel_c[panel_c.paradigm == 'Real-time'].Acc
straps = np.array(bootstrap(to_use))
median_value = np.round(np.median(to_use),2)
low_bound = np.round(np.percentile(straps,0.5),2)
up_bound = np.round(np.percentile(straps,99.5),2)
print('Results for: ','Real-time')
print('Median: ', median_value)
print('99% CI: [', low_bound, ', ', up_bound, ']')
print('')





Results for:  Real-time
Median:  87.5
99% CI: [ 85.71 ,  100.0 ]



In [20]:
d = {'Evaluation 1': [], 'Evaluation 2': [], 'Test-statistic': [], 'P-value': []} 
compares = [('Chance','Neural-only'),('Chance','Real-time'),('Neural-only','Real-time')]

for compare in compares:
    d['Evaluation 1'].append(compare[0])
    d['Evaluation 2'].append(compare[1])

    result = mannwhitneyu(panel_c['Acc'][(panel_c['paradigm'] == compare[0])],
             panel_c['Acc'][(panel_c['paradigm'] == compare[1])])
    d['Test-statistic'].append(result[0])
    d['P-value'].append(result[1])

d = pd.DataFrame(d)
d['P-value'] = multipletests(d['P-value'].values,alpha=0.01,method='holm')[1]


In [21]:
print(d.to_latex(index=False,float_format="{:0.2e}".format))  

\begin{tabular}{llrr}
\toprule
Evaluation 1 & Evaluation 2 &  Test-statistic &  P-value \\
\midrule
      Chance &  Neural-only &        1.05e+02 & 4.79e-03 \\
      Chance &    Real-time &        1.50e+01 & 5.44e-07 \\
 Neural-only &    Real-time &        1.02e+02 & 4.79e-03 \\
\bottomrule
\end{tabular}



  print(d.to_latex(index=False,float_format="{:0.2e}".format))


In [22]:
print_bootstrap_results(panel_g,'Real-time','overall')
print_bootstrap_results(panel_g,'Real-time','english')
print_bootstrap_results(panel_g,'Real-time','spanish')

Results for:  Real-time   overall
Median:  21.88
99% CI: [ 16.67 ,  27.59 ]

Results for:  Real-time   english
Median:  20.0
99% CI: [ 6.67 ,  33.33 ]

Results for:  Real-time   spanish
Median:  20.0
99% CI: [ 16.67 ,  28.57 ]



In [23]:
d = {'Language': [], 'Paradigm 1': [], 'Paradigm 2': [], 'Test-statistic': [], 'P-value': []} 
compares = [('Chance','Real-time')]
langs = ['overall','spanish','english']
all_ps = []
for lang in langs:
    for compare in compares:
        d['Language'].append(lang)
        d['Paradigm 1'].append(compare[0])
        d['Paradigm 2'].append(compare[1])
    
        result = mannwhitneyu(panel_g['WER'][(panel_g['lang'] == lang) & (panel_g['Metric'] == compare[0])],
                 panel_g['WER'][(panel_g['lang'] == lang) & (panel_g['Metric'] == compare[1])])
        d['Test-statistic'].append(result[0])
        d['P-value'].append(result[1])

d = pd.DataFrame(d)
d['P-value'] = multipletests(d['P-value'].values,alpha=0.01,method='holm')[1]

In [24]:
print(d.to_latex(index=False,float_format="{:0.2e}".format))  

\begin{tabular}{lllrr}
\toprule
Language & Paradigm 1 & Paradigm 2 &  Test-statistic &  P-value \\
\midrule
 overall &     Chance &  Real-time &        4.41e+02 & 6.41e-08 \\
 spanish &     Chance &  Real-time &        4.41e+02 & 6.41e-08 \\
 english &     Chance &  Real-time &        4.41e+02 & 6.41e-08 \\
\bottomrule
\end{tabular}



  print(d.to_latex(index=False,float_format="{:0.2e}".format))


In [25]:
panel_f

Unnamed: 0,score,target-lang,correct_decode
0,0.206533,Target language,False
1,0.629613,Off-target language,False
2,0.648934,Off-target language,True
3,0.662223,Target language,True
4,0.599495,Target language,True
...,...,...,...
267,0.662223,Target language,True
268,0.475434,Target language,True
269,0.611038,Off-target language,True
270,0.000000,Off-target language,True


In [26]:
# Target vs off target language scores for INCORRECT decodes


mannwhitneyu(panel_f[np.logical_not(panel_f['correct_decode'])][panel_f['target-lang'] == 'Target language'].score.values,
             panel_f[np.logical_not(panel_f['correct_decode'])][panel_f['target-lang'] == 'Off-target language'].score.values)

  mannwhitneyu(panel_f[np.logical_not(panel_f['correct_decode'])][panel_f['target-lang'] == 'Target language'].score.values,
  panel_f[np.logical_not(panel_f['correct_decode'])][panel_f['target-lang'] == 'Off-target language'].score.values)


MannwhitneyuResult(statistic=60.0, pvalue=0.5066281058229414)

In [27]:
# Target vs off target language scores for CORRECT decodes

mannwhitneyu(panel_f[panel_f['correct_decode']][panel_f['target-lang'] == 'Target language'].score.values,
             panel_f[panel_f['correct_decode']][panel_f['target-lang'] == 'Off-target language'].score.values)

  mannwhitneyu(panel_f[panel_f['correct_decode']][panel_f['target-lang'] == 'Target language'].score.values,
  panel_f[panel_f['correct_decode']][panel_f['target-lang'] == 'Off-target language'].score.values)


MannwhitneyuResult(statistic=11177.5, pvalue=6.533119893582141e-10)