In [7]:
# imports

import pandas as pd
from scipy import stats
from scipy.stats import mannwhitneyu

# for creating pair combinations
from itertools import combinations
import seaborn as sns

# Let pandas render up to 100 rows of a displayed table before it switches to a truncated view.
pd.set_option('display.max_rows', 100)


In [8]:
# Load the result table; each column holds the results of one
# anonymization-method / model combination (see the column names in the preview).
df = pd.read_excel('result_data.xlsx')

# Preview the first five rows
df.head()

Unnamed: 0,cb_lin,cb_lasso,cb_ridge,mondrian_lin,mondrian_lasso,mondrian_ridge,ola_lin,ola_ridge,ola_lasso,tdg_lin,tdg_ridge,tdg_lasso
0,53.672852,53.497689,53.729505,52.887949,52.887899,52.586632,22.994265,22.579093,22.992484,53.50111,52.717343,52.947963
1,53.055599,52.799154,53.144808,50.619647,50.619737,50.321428,51.052447,50.73019,51.051379,53.316143,52.35714,53.367357
2,52.336882,52.892518,52.820872,50.214292,50.214209,49.891701,52.87761,52.518651,52.876615,52.545893,51.879003,52.639397
3,52.106122,52.611953,52.049599,47.829836,47.829716,47.484848,51.961017,51.610548,51.959846,52.121545,52.434886,52.390137
4,51.648057,51.394093,51.769857,47.207387,47.206748,46.857136,22.758017,22.386737,22.756592,51.602896,51.257857,52.310995


In [9]:
# Stats test
# cb_lin = df['cb_lin']
# cb_lasso = df['cb_lasso']


# stats.ttest_ind(cb_lin, cb_lasso)

In [14]:
# Hypothesis tested for every pair of result columns:
#   H0 (null hypothesis): both columns come from the same distribution.
#
# Reading the p-value at the 0.05 level:
#   p < 0.05 -> reject H0, the two distributions are NOT the same.
#   p > 0.05 -> H0 cannot be rejected, so there is no proof (yet) that the
#               distributions are not the same.

# Parallel result lists: entry k of each list belongs to the k-th pair
p_value_list = []
combi1_list = []
combi2_list = []

# Every unordered pair of columns in df
all_combinations = list(combinations(df.columns, 2))

# Run the test once per (model, anonymization method) pair
for pair in all_combinations:
    pair_frame = df.loc[:, pair]
    first_sample = pair_frame.iloc[:, [0]]
    second_sample = pair_frame.iloc[:, [1]]

    # Mann-Whitney U test on the two single-column frames. Because the inputs
    # are 2-D frames, the p-value comes back wrapped in an array; it is
    # unwrapped later together with the column-name arrays.
    test_result = mannwhitneyu(first_sample, second_sample, method='exact')

    p_value_list.append(test_result.pvalue)
    combi1_list.append(first_sample.columns.values)
    combi2_list.append(second_sample.columns.values)



# Cleaning lists
def array_to_list(uncleaned_list):
    """Convert each array in ``uncleaned_list`` to a plain Python list.

    Parameters
    ----------
    uncleaned_list : iterable
        Items that expose a ``.tolist()`` method (e.g. NumPy arrays).

    Returns
    -------
    list
        One entry per input item, holding that item's ``.tolist()`` result,
        in the original order.
    """
    return [item.tolist() for item in uncleaned_list]


# Turn the collected NumPy arrays into nested Python lists ...
combi1_list_cleaned = array_to_list(combi1_list)
combi2_list_cleaned = array_to_list(combi2_list)
p_value_list_cleaned = array_to_list(p_value_list)

# ... and flatten them so every list holds exactly one plain value per pair
combi1_list_cleaned = [val for sublist in combi1_list_cleaned for val in sublist]
combi2_list_cleaned = [val for sublist in combi2_list_cleaned for val in sublist]
p_value_list_cleaned = [val for sublist in p_value_list_cleaned for val in sublist]

# Creating a dataframe of all the lists of results
# (one row per pair: first column name, second column name, p-value)
p_value_df = pd.DataFrame(
    {'combi1': combi1_list_cleaned,
     'combi2': combi2_list_cleaned,
     'p_value': p_value_list_cleaned,
    })

# 1) Printing statistical results of all 66 combinations:
print("Printing all statistical results:")
display(p_value_df)

# Interpretation at the 0.05 significance level:
# - p value GREATER than 0.05: the null hypothesis cannot be rejected, i.e. there
#   is no proof that the two distributions are DISTINCT.
# - p value SMALLER than 0.05: the null hypothesis is rejected, i.e. the two
#   distributions are NOT the same.

# 2) Keeping all pairs with a p value larger than 0.05.
# The table-formatting cell further down styles `p_larger_5`, so it has to be
# defined here for the notebook to run top to bottom on a fresh kernel.
p_larger_5 = p_value_df[p_value_df['p_value'] > 0.05]
# Optional export of the filtered table:
# p_larger_5.to_csv('p_larger_5.csv', index=False)


Printing all statistical results:


Unnamed: 0,combi1,combi2,p_value
0,cb_lin,cb_lasso,0.9664417
1,cb_lin,cb_ridge,0.8102799
2,cb_lin,mondrian_lin,0.003053225
3,cb_lin,mondrian_lasso,0.003003502
4,cb_lin,mondrian_ridge,0.001268759
5,cb_lin,ola_lin,9.351722000000001e-23
6,cb_lin,ola_ridge,6.328806000000001e-23
7,cb_lin,ola_lasso,9.351722000000001e-23
8,cb_lin,tdg_lin,0.03588598
9,cb_lin,tdg_ridge,0.09122138


In [6]:
# formatting of tables

def color_negative_red(value):
    """Return a CSS color declaration chosen by the sign of ``value``.

    Intended for coloring individual elements of a dataframe.

    Parameters
    ----------
    value : number
        The cell value to inspect.

    Returns
    -------
    str
        ``'color: green'`` for positive values, ``'color: red'`` for negative
        values and ``'color: black'`` for everything else (zero, or NaN, which
        compares as neither positive nor negative).
    """
    # Start from the fallback and only override it for strictly signed values
    color = 'black'
    if value > 0:
        color = 'green'
    elif value < 0:
        color = 'red'

    return 'color: %s' % color


# Set CSS properties for th elements in dataframe
th_props = [
  ('font-size', '11px'),
  ('text-align', 'center'),
  ('font-weight', 'bold'),
  ('color', 'black'),
  ('background-color', '#f7f7f9')
  ]

# Set CSS properties for td elements in dataframe
td_props = [
  ('font-size', '11px')
  ]

# Set table styles
styles = [
  dict(selector="th", props=th_props),
  dict(selector="td", props=td_props)
  ]


def _style_p_values(frame, cmap, number_format, table_styles):
    """Build the styled view of a p-value table.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with a ``'p_value'`` column.
    cmap : matplotlib colormap
        Colormap for the background gradient of the ``'p_value'`` column.
    number_format : str
        Format string applied to the ``'p_value'`` column (e.g. ``"{:.2}"``).
    table_styles : list of dict
        CSS table styles (``selector`` / ``props`` entries) for the table.

    Returns
    -------
    pandas.io.formats.style.Styler
        Styler with gradient, max-highlight, caption, number format and
        table styles applied to ``frame``.
    """
    return (frame.style
      .background_gradient(cmap=cmap, subset=['p_value'])
      .highlight_max(subset=['p_value'])
      .set_caption('This is a custom caption.')
      .format({'p_value': number_format})
      .set_table_styles(table_styles))


# NOTE: this cell needs `p_larger_5` (the rows of p_value_df whose p_value is
# larger than 0.05) to exist before it runs.

# Variant 1: blue gradient, p-values formatted as percentages.
# A Styler only renders automatically when it is the LAST expression of a cell,
# so this first table has to be shown with an explicit display() call.
cm = sns.light_palette("blue", as_cmap=True)
display(_style_p_values(p_larger_5, cm, "{:.2%}", styles))

# Variant 2: set colormap equal to seaborns light green color palette,
# p-values formatted with two significant digits. `cm` stays green afterwards.
cm = sns.light_palette("#00FF00", as_cmap=True)
_style_p_values(p_larger_5, cm, "{:.2}", styles)

Unnamed: 0,combi1,combi2,p_value
0,cb_lin,cb_lasso,0.97
1,cb_lin,cb_ridge,0.81
9,cb_lin,tdg_ridge,0.091
11,cb_lasso,cb_ridge,0.82
19,cb_lasso,tdg_ridge,0.092
30,mondrian_lin,mondrian_lasso,0.88
31,mondrian_lin,mondrian_ridge,0.16
38,mondrian_lasso,mondrian_ridge,0.16
52,ola_lin,ola_lasso,0.12
63,tdg_lin,tdg_ridge,0.53


In [9]:
# Styled view of the complete p-value table, using the colormap (`cm`) and the
# CSS table styles (`styles`) defined in the table-formatting cell.
p_value_styler = p_value_df.style
p_value_styler = p_value_styler.background_gradient(cmap=cm, subset=['p_value'])
p_value_styler = p_value_styler.highlight_max(subset=['p_value'])
p_value_styler = p_value_styler.set_caption('This is a custom caption.')
p_value_styler = p_value_styler.format({'p_value': "{:.2}"})
p_value_styler = p_value_styler.set_table_styles(styles)

# Last expression of the cell, so the notebook renders the styled table
p_value_styler

Unnamed: 0,combi1,combi2,p_value
0,cb_lin,cb_lasso,0.97
1,cb_lin,cb_ridge,0.81
2,cb_lin,mondrian_lin,0.0031
3,cb_lin,mondrian_lasso,0.003
4,cb_lin,mondrian_ridge,0.0013
5,cb_lin,ola_lin,9.4e-23
6,cb_lin,ola_ridge,6.3e-23
7,cb_lin,ola_lasso,9.4e-23
8,cb_lin,tdg_lin,0.036
9,cb_lin,tdg_ridge,0.091
