# Calculate stats for clinical metadata.

In [2]:
import pandas as pd
from scipy.stats import chisquare
from scipy.stats import f_oneway


In [3]:
#Import metadata
md=pd.read_table("11129_20200129-223118.txt")

#Format metadata
md = md.query('analysis_t1d != "not applicable" & host_age != "restricted access"')\
.groupby('anonymized_name')\
.agg("first")

# Sex

In [48]:
sex_data = md.groupby('analysis_disease_stage_group').sex.apply(lambda x: (sum(x=="male"),len(x)))

observed = [x[0] for x in sex_data.values]
expected = [x[1]/2 for x in sex_data.values]

sex_data

analysis_disease_stage_group
NH      (21, 33)
NT1D    (17, 33)
OH      (13, 26)
OT1D    (12, 24)
Name: sex, dtype: object

In [45]:
chisquare(observed, expected)

Power_divergenceResult(statistic=1.2424242424242424, pvalue=0.74284822918191584)

# Age

In [14]:
ages = [md.query('analysis_disease_stage_group == @x').host_age.values for x in set(md.analysis_disease_stage_group)]
f_oneway(*ages)


F_onewayResult(statistic=0.42630734171017021, pvalue=0.73452417938352488)

# Height

In [16]:
heights = [md.query('analysis_disease_stage_group == @x').host_height.values for x in set(md.analysis_disease_stage_group)]
f_oneway(*heights)


F_onewayResult(statistic=0.63544512553345744, pvalue=0.59369800838505615)

# Race

In [78]:
md.groupby('analysis_disease_stage_group').race.value_counts()

analysis_disease_stage_group  race 
NH                            White    25
                              Other     8
NT1D                          White    26
                              Other     5
                              Asian     2
OH                            Other    13
                              White    11
                              Asian     1
                              Black     1
OT1D                          White    13
                              Other     8
                              Black     3
Name: race, dtype: int64

In [75]:
def chi_sq_race(md, race_to_test):
    #Get data on race to test
    race_data = md.groupby('analysis_disease_stage_group').race.apply(lambda x: (sum(x==race_to_test),len(x)))
    #Get observed counts of that race
    observed = [x[0] for x in race_data.values]
    #Get total individuals per group
    n_per_group = [x[1] for x in race_data.values]
    total = sum(n_per_group)
    #Get proportions that would be expected
    proportions = [x/total for x in n_per_group]
    total_white = sum(observed)
    #Get expected values
    expected=[total_white*x for x in proportions]
    #Calculate chi square
    result = chisquare(observed, expected)
    return(result)

In [82]:
#Calculate chi square for white individuals. Counts too low to calculate for other ethnicities.
chi_sq_race(md, "White")

Power_divergenceResult(statistic=4.065221445221443, pvalue=0.2545069720850246)