In [153]:
import pingouin as pg
import pandas as pd
import scikit_posthocs as sp

<h1 style="text-align: center" >Analysis Functions</h1>

In [None]:
def assumptions_test(data_path):
    """Load a wide-format CSV and run the normality and equal-variance checks.

    Every column of the CSV is counted as one group. The table is melted into
    long format (columns ``Group`` and ``value``), rows containing missing
    values are dropped and the index is reset before the tests are run.

    Parameters
    ----------
    data_path : str or path-like
        Location of the CSV file; handed straight to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(normality, homoscedasticity, shape, data)``: the per-group result
        of ``pg.normality``, the result of ``pg.homoscedasticity`` using the
        Levene method, a two-line summary string with the observation and
        group counts, and the long-format DataFrame used for the tests.
    """
    wide_table = pd.read_csv(data_path)
    # Groups are counted on the raw table, i.e. before any missing rows are removed.
    group_count = len(wide_table.columns)

    long_table = wide_table.melt(var_name='Group')
    long_table = long_table.dropna().reset_index(drop=True)
    observation_count = len(long_table)

    summary = f"No. Observations: {observation_count}\nNo. Groups: {group_count}"

    normality = pg.normality(long_table, group='Group', dv='value')
    equal_variance = pg.homoscedasticity(
        long_table, group='Group', dv='value', method='levene'
    )

    return normality, equal_variance, summary, long_table

In [87]:
def group_comparison(data, test, alpha=0.05):
    """Run an omnibus group-comparison test on long-format data.

    Parameters
    ----------
    data : pandas.DataFrame
        Long-format frame with a ``Group`` column (grouping factor) and a
        ``value`` column (dependent variable).
    test : str
        Name of the test to run. Only ``"kruskal"`` is implemented.
    alpha : float, default 0.05
        Threshold below which the uncorrected p-value is flagged as
        significant.

    Returns
    -------
    pandas.DataFrame
        The ``pg.kruskal`` result with an added boolean ``Significance``
        column (``p-unc < alpha``).

    Raises
    ------
    ValueError
        If ``test`` is not a supported test name. Previously an unknown name
        silently returned ``None``.
    """
    if test != "kruskal":
        raise ValueError(
            f"Unsupported test {test!r}; only 'kruskal' is implemented."
        )

    kruskal_result = pg.kruskal(data, between='Group', dv='value')
    # Vectorised comparison; yields the same boolean column as a per-element map.
    kruskal_result['Significance'] = kruskal_result['p-unc'] < alpha
    return kruskal_result
    

<h1 style="text-align: center" >Cancer</h1>

<h2>Test Assumptions</h2>

In [130]:
cancer_normality, cancer_homoscedasticity, cancer_shape, cancer_data = assumptions_test("variable_data/cancer.csv")

<h3>Data Dimensions</h3>

In [132]:
print(cancer_shape)

No. Observations: 372
No. Groups: 3


<h3>Normality Test (Shapiro-Wilk)</h3>

In [133]:
cancer_normality

Unnamed: 0_level_0,W,pval,normal
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Clinically healthy,0.870509,2.069097e-14,False
Single condition: Cancer,0.907791,0.001133087,False
Cancer and at least one other condition,0.851004,1.393117e-05,False


<h3>Homogeneity of Variances (Levene's)</h3>

In [134]:
cancer_homoscedasticity

Unnamed: 0,W,pval,equal_var
levene,3.790998,0.023457,False


<h2>Group Comparison (Kruskal-Wallis)</h2>

In [137]:
cancer_kruskal = group_comparison(cancer_data, test='kruskal')

In [138]:
cancer_kruskal

Unnamed: 0,Source,ddof1,H,p-unc,Significance
Kruskal,Group,2,27.113771,1e-06,True


<h2>Post-Hoc Test (Dunn)</h2>

In [139]:
# Pairwise Dunn comparisons between the cancer groups, Holm-adjusted p-values.
cancer_dunn_result = sp.posthoc_dunn(
    cancer_data,
    val_col="value",
    group_col="Group",
    p_adjust='holm',
)

In [141]:
# p-values below 0.05 are printed with 7 decimals and a trailing " *";
# all other values are printed with 5 decimals.
cancer_dunn_result.map(
    lambda p_value: f"{p_value:.7f} *" if p_value < 0.05 else f"{p_value:.5f}"
)

Unnamed: 0,Cancer and at least one other condition,Clinically healthy,Single condition: Cancer
Cancer and at least one other condition,1.00000,0.0154261 *,0.08226
Clinically healthy,0.0154261 *,1.00000,0.0000041 *
Single condition: Cancer,0.08226,0.0000041 *,1.00000


<h1 style="text-align: center" >Cardiovascular Disease</h1>

<h2>Test Assumptions</h2>

In [144]:
cvd_normality, cvd_homoscedasticity, cvd_shape, cvd_data = assumptions_test("variable_data/cvd.csv")

<h3>Data Dimensions</h3>

In [145]:
print(cvd_shape)

No. Observations: 615
No. Groups: 4


<h3>Normality Test (Shapiro-Wilk)</h3>

In [147]:
cvd_normality

Unnamed: 0_level_0,W,pval,normal
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Single condition (CVD),0.847208,5.048727e-09,False
CVD and at least one other condition,0.861485,3.227682e-06,False
Non-cardiovascular comorbidities,0.862405,2.046331e-11,False
No known comorbidities,0.870509,2.069097e-14,False


<h3>Homogeneity of Variances (Levene's)</h3>

In [148]:
cvd_homoscedasticity

Unnamed: 0,W,pval,equal_var
levene,2.079986,0.101678,True


<h2>Group Comparison (Kruskal-Wallis)</h2>

In [149]:
cvd_kruskal = group_comparison(cvd_data, test='kruskal')

In [150]:
cvd_kruskal

Unnamed: 0,Source,ddof1,H,p-unc,Significance
Kruskal,Group,3,11.913735,0.007685,True


<h2>Post-Hoc Test (Dunn)</h2>

In [151]:
# Pairwise Dunn comparisons between the CVD groups, Holm-adjusted p-values.
cvd_dunn_result = sp.posthoc_dunn(
    cvd_data,
    val_col="value",
    group_col="Group",
    p_adjust='holm',
)

In [152]:
# p-values below 0.05 are printed with 7 decimals and a trailing " *";
# all other values are printed with 5 decimals.
cvd_dunn_result.map(
    lambda p_value: f"{p_value:.7f} *" if p_value < 0.05 else f"{p_value:.5f}"
)

Unnamed: 0,CVD and at least one other condition,No known comorbidities,Non-cardiovascular comorbidities,Single condition (CVD)
CVD and at least one other condition,1.00000,0.14082,0.46803,0.0292309 *
No known comorbidities,0.14082,1.0,0.17217,0.36159
Non-cardiovascular comorbidities,0.46803,0.17217,1.00000,0.0312200 *
Single condition (CVD),0.0292309 *,0.36159,0.0312200 *,1.00000


<h1 style="text-align: center" >Physical Activity</h1>

<h2>Test Assumptions</h2>

In [None]:
physical_normality, physical_homoscedasticity, physical_shape, physical_data = assumptions_test("variable_data/physical.csv")

<h3>Data Dimensions</h3>

In [None]:
print(physical_shape)

No. Observations: 609
No. Groups: 3


<h3>Normality Test (Shapiro-Wilk)</h3>

In [None]:
physical_normality

Unnamed: 0_level_0,W,pval,normal
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Sedentary, Minimally, Lightly active",0.906515,4.347425e-10,False
Moderately active,0.835726,2.120031e-11,False
Highly active,0.82408,1.926432e-16,False


<h3>Homogeneity of Variances (Levene's)</h3>

In [None]:
physical_homoscedasticity

Unnamed: 0,W,pval,equal_var
levene,2.548281,0.079054,True


<h2>Group Comparison (Kruskal-Wallis)</h2>

In [None]:
physical_kruskal = group_comparison(physical_data, test="kruskal")

In [None]:
physical_kruskal

Unnamed: 0,Source,ddof1,H,p-unc,Significance
Kruskal,Group,2,14.721032,0.000636,True


<h2>Post-Hoc Test (Dunn)</h2>

In [142]:
# Pairwise Dunn comparisons between the activity groups, Holm-adjusted p-values.
physical_dunn_result = sp.posthoc_dunn(
    physical_data,
    val_col="value",
    group_col="Group",
    p_adjust='holm',
)

In [143]:
# Every p-value is printed with 5 decimals; those below 0.05 get a trailing " *".
physical_dunn_result.map(
    lambda p_value: f"{p_value:.5f} *" if p_value < 0.05 else f"{p_value:.5f}"
)

Unnamed: 0,Highly active,Moderately active,"Sedentary, Minimally, Lightly active"
Highly active,1.00000,0.1892,0.00038 *
Moderately active,0.18920,1.0,0.08366
"Sedentary, Minimally, Lightly active",0.00038 *,0.08366,1.00000


<h1 style="text-align: center" >Age</h1>

<h2>Test Assumptions</h2>

In [88]:
age_normality, age_homoscedasticity, age_shape, age_data = assumptions_test("variable_data/age.csv")

<h3>Data Dimensions</h3>

In [89]:
print(age_shape)

No. Observations: 615
No. Groups: 5


<h3>Normality Test (Shapiro-Wilk)</h3>

In [90]:
age_normality

Unnamed: 0_level_0,W,pval,normal
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adolescent,0.879088,0.01703723,False
Young Adult,0.85533,5.184404e-16,False
Middle-Aged Adult,0.880857,2.844875e-08,False
Elderly (Young-Old),0.868207,2.269129e-10,False
Oldest-Old,0.806206,0.0001756413,False


<h3>Homogeneity of Variances (Levene's)</h3>

In [91]:
age_homoscedasticity

Unnamed: 0,W,pval,equal_var
levene,1.217865,0.30196,True


<h2>Group Comparison (Kruskal-Wallis)</h2>

In [92]:
age_kruskal = group_comparison(age_data, test="kruskal")

In [93]:
age_kruskal 

Unnamed: 0,Source,ddof1,H,p-unc,Significance
Kruskal,Group,4,5.460023,0.243266,False


<h1 style="text-align: center" >Sex</h1>

<h2>Test Assumptions</h2>

In [94]:
sex_normality, sex_homoscedasticity, sex_shape, sex_data = assumptions_test("variable_data/sex.csv")

<h3>Data Dimensions</h3>

In [95]:
print(sex_shape)

No. Observations: 615
No. Groups: 2


<h3>Normality Test (Shapiro-Wilk)</h3>

In [96]:
sex_normality

Unnamed: 0_level_0,W,pval,normal
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,0.840864,2.736691e-16,False
Female,0.876981,1.023374e-15,False


<h3>Homogeneity of Variances (Levene's)</h3>

In [97]:
sex_homoscedasticity

Unnamed: 0,W,pval,equal_var
levene,0.538139,0.463486,True


<h2>t-Test</h2>

In [98]:
# The t-test takes one series per group, so the wide CSV is read again here
# instead of reusing the melted long-format frame.
sex_raw_data = pd.read_csv("variable_data/sex.csv")
# NOTE(review): the recorded output shows a fractional dof, which suggests
# pingouin applied Welch's correction (its default `correction='auto'`) — confirm.
sex_ttest = pg.ttest(
    x=sex_raw_data['Male'],
    y=sex_raw_data['Female'],
)
# Flag the result as significant when the p-value is below 0.05.
sex_ttest['Significance'] = sex_ttest['p-val'] < 0.05

In [99]:
sex_ttest

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power,Significance
T-test,-0.618002,597.61657,two-sided,0.53681,"[-1.72, 0.89]",0.049954,0.109,0.094472,False
