In [1]:
##Latest notebook visualizing results for Distribution Paper by Julia Geller
##Updated:  07/02/2022

# Required to access the database
import os
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"


import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

# Data analysis tools
import pandas as pd
import numpy as np
import seaborn as sns

# Models available in our application
from datasets.models import RawFlower, RawUNM, RawDAR, RawNHANES_BIO
from django.contrib.auth.models import User
from datasets.models import RawDictionary


from datasets.models import RawNEU
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels

from api import adapters

In [2]:
##merges 4 cohorts based on columns in common
def merge4CohortFrames2(df1, df2, df3,df4):
    """Stack four cohort frames on the columns they all share.

    Parameters
    ----------
    df1, df2, df3, df4 : pandas.DataFrame
        One frame per cohort. ``df1``-``df3`` are modified in place: any of
        the arsenic-species columns listed below that is missing is added
        and filled with NaN, so that a column present in ``df4`` is not
        dropped from the intersection just because an earlier cohort lacks it.

    Returns
    -------
    (pandas.DataFrame, set)
        The row-wise concatenation of the four frames restricted to the
        shared columns, and the set of shared column names.
    """
    # NOTE(review): df4 is not padded with these columns, so they only
    # survive the intersection when df4 already has them - confirm intended.
    for as_feature in ['UASB', 'UDMA', 'UAS5', 'UIAS', 'UAS3', 'UMMA']:
        for frame in (df1, df2, df3):
            if as_feature not in frame.columns:
                frame[as_feature] = np.nan

    cc = set.intersection(
        set(df1.columns), set(df2.columns), set(df3.columns), set(df4.columns)
    )

    # pandas does not accept a set as a column indexer; use a list that
    # follows df1's column order so the result is the same on every run.
    shared = [col for col in df1.columns if col in cc]

    df_all = pd.concat([df1[shared], df2[shared], df3[shared], df4[shared]])

    return (df_all, cc)

In [3]:
## Create aliases: the display label written into 'CohortType' for each cohort
NEU_alias = 'PROTECT'
DAR_alias = 'NHBCS'
UNM_alias = 'Navajo'
NHANES_alias = 'NHANES'

## Get the data

## NEU (Cohort 1)
df_NEU = adapters.neu.get_dataframe_orig()
# .copy() so the CohortType assignment below writes to an independent frame
# rather than to a filtered slice of the adapter's frame.
df_NEU = df_NEU[df_NEU['TimePeriod']==2].copy() # Visit 2

NEU_covars = adapters.neu.get_dataframe_covars()
# CohortType is one of the join keys, so the merge has to run before the
# column is overwritten with the alias.
df_NEU_covars = NEU_covars.merge(df_NEU, on = ['PIN_Patient','CohortType','TimePeriod']) #Merge the covariates
df_NEU_covars['CohortType'] = NEU_alias
df_NEU['CohortType'] = NEU_alias


df_NEU_blod = adapters.neu.get_dataframe_BLOD()
df_NEU_blod['CohortType'] = NEU_alias
df_NEU_blod = df_NEU_blod[df_NEU_blod['TimePeriod']==2]


## DAR (Cohort 2)

df_DAR = adapters.dar.get_dataframe()

df_DAR_blod = adapters.dar.get_dataframe_BLOD()
df_DAR_blod['CohortType'] = DAR_alias

# Covariate merge for DAR is currently disabled:
# DAR_covars = adapters.dar.get_dataframe_covars()
# df_DAR_covars = DAR_covars.merge(df_DAR, on = ['PIN_Patient','CohortType','TimePeriod']) #Merge the covariates
df_DAR['CohortType'] = DAR_alias


## UNM (Cohort 3)

df_UNM = adapters.unm.get_dataframe_orig()

df_UNM_blod = adapters.unm.get_dataframe_BLOD()
df_UNM_blod['CohortType'] = UNM_alias

UNM_covars = adapters.unm.get_dataframe_covars()
df_UNM_covars = UNM_covars.merge(df_UNM, on = ['PIN_Patient','CohortType','TimePeriod']) #Merge the covariates
df_UNM_covars['CohortType'] = UNM_alias
# The original line here was a comparison (`==`), which left df_UNM with the
# adapter's label, so UNM rows in df_ALL never matched UNM_alias. Assign the
# alias, and only after the merge above because CohortType is a join key.
df_UNM['CohortType'] = UNM_alias

# Disabled stand-in that substitutes NEU data for UNM.
# ##!!DELETE FOR WHEN FINALIZING RESULTS
# df_UNM = df_NEU.copy()
# df_UNM['CohortType'] = UNM_alias
# df_UNM_blod = df_NEU_blod.copy()
# df_UNM_blod['CohortType'] = UNM_alias
# df_UNM_covars = df_NEU_covars.copy()
# df_UNM_covars['CohortType'] = UNM_alias

## NHANES (Cohort 4)

df_NHANES = adapters.nhanes.get_dataframe_orig()
df_NHANES = df_NHANES.rename_axis(None, axis=1)

df_NHANES_blod = adapters.nhanes.get_dataframe_orig_blod()

NHANES_covars = adapters.nhanes.get_dataframe_covars()
NHANES_covars['CohortType'] = NHANES_alias

##df_NHANES_covars = NHANES_covars.merge(df_NHANES, on = ['PIN_Patient','CohortType','TimePeriod'])
df_NHANES['CohortType'] = NHANES_alias
df_NHANES_blod['CohortType'] = NHANES_alias

# Stack the four cohorts on their shared columns. This snapshot is taken
# before any later transformation of the individual cohort frames.
df_ALL, intersec_cols = merge4CohortFrames2(df_NEU, df_UNM, df_DAR, df_NHANES)


## df_ALL = analysis.merge3CohortFrames(df_NEU, df_UNM, df_DAR)
frames_for_analysis = [
    (NEU_alias, df_NEU),
    (DAR_alias, df_DAR),
    (UNM_alias, df_UNM),
    (NHANES_alias, df_NHANES),
    ('ALL', df_ALL)

]

# Report (rows, columns) for every frame as a quick sanity check.
for name, df in frames_for_analysis:
    print('Data Stats')
    print(name)
    print(df.shape)

Data Stats
PROTECT
(570, 32)
Data Stats
NHBCS
(2152, 198)
Data Stats
Navajo
(521, 30)
Data Stats
NHANES
(8583, 23)
Data Stats
ALL
(11826, 16)


In [4]:
# Show the column names, non-null counts and dtypes of the NHANES BLOD frame.
df_NHANES_blod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8744 entries, 0 to 8743
Data columns (total 23 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PIN_Patient  8744 non-null   int64  
 1   TimePeriod   8744 non-null   object 
 2   UALB_mg      6376 non-null   float64
 3   UALB_ug      6376 non-null   float64
 4   UBA          2893 non-null   float64
 5   UCD          2893 non-null   float64
 6   UCO          2893 non-null   float64
 7   UCR          621 non-null    float64
 8   UCRT_mg      4055 non-null   float64
 9   UCRT_umol    4055 non-null   float64
 10  UCS          2893 non-null   float64
 11  UHG          2895 non-null   float64
 12  UI           1354 non-null   float64
 13  UMN          2893 non-null   float64
 14  UMO          2892 non-null   float64
 15  UNI          621 non-null    float64
 16  UPB          2893 non-null   float64
 17  USB          2893 non-null   float64
 18  USN          2892 non-null   float64
 19  UTAS  

In [5]:
# Analytes in common across all 4 cohorts (identifier columns removed).
# intersec_cols is a set, whose iteration order can change between kernel
# runs; sorting keeps the analyte order stable from run to run.
intersec_analytes = sorted(intersec_cols)
intersec_analytes.remove('CohortType')
intersec_analytes.remove('PIN_Patient')
intersec_analytes.remove('TimePeriod')

# Analyte adjustments
def adjust(neu, unm, dar, nhanes, cols):
    """Rescale every analyte column in ``cols`` by a per-cohort reference column.

    Each value is multiplied by ``(nanmedian(reference) - 1)`` and divided by
    ``(reference - 1)``, where ``reference`` is the cohort's dilution column
    named in the table below.

    The four frames are modified in place and returned in the order
    ``(neu, unm, dar, nhanes)``.
    """
    # NOTE(review): UNM and NHANES use a creatinine column with the same
    # "(median - 1) / (x - 1)" form that the other two cohorts apply to
    # specific gravity - confirm this is the intended creatinine adjustment.
    reference_columns = (
        (dar, 'urine_specific_gravity'),
        (unm, 'creatininemgdl'),
        (neu, 'SPECIFICGRAVITY_V2'),
        (nhanes, 'UCRT_mg'),
    )

    for frame, ref_name in reference_columns:
        for analyte in cols:
            reference = frame[ref_name]
            scaled = frame[analyte] * (np.nanmedian(reference) - 1)
            frame[analyte] = scaled / (reference - 1)

    return neu, unm, dar, nhanes

# Rebind the cohort names to the adjusted frames. From here on df_NEU and
# df_UNM refer to the covariate-merged frames (df_NEU_covars / df_UNM_covars).
# adjust() writes into the frames it is given, so re-running this cell applies
# the adjustment a second time.
df_NEU, df_UNM, df_DAR, df_NHANES = adjust(df_NEU_covars, df_UNM_covars, df_DAR, df_NHANES, intersec_analytes)


In [8]:
# Pair each cohort frame with its BLOD frame: (cohort frame, BLOD frame).
dfs = [(df_NEU,df_NEU_blod), (df_UNM, df_UNM_blod), (df_DAR, df_DAR_blod), (df_NHANES, df_NHANES_blod)]


In [9]:
def make_above_lod_df(dfs, intersec_cols, dar_alias):
    """Return copies of the cohort frames with flagged measurements set to NaN.

    Parameters
    ----------
    dfs : sequence of (pandas.DataFrame, pandas.DataFrame)
        ``(cohort frame, BLOD frame)`` pairs, one per cohort. The cohort
        frame needs ``'PIN_Patient'`` and ``'CohortType'`` columns; the BLOD
        frame needs an id column and one column per analyte.
    intersec_cols : iterable of str
        Analyte columns to process.
    dar_alias : str
        ``CohortType`` label of the cohort whose BLOD frame is keyed by
        ``'unq_id'`` instead of ``'PIN_Patient'``.

    Returns
    -------
    tuple of pandas.DataFrame
        One processed copy per input pair, in input order. The inputs are
        not modified.

    Notes
    -----
    A measurement is blanked when the first BLOD row with the same patient
    id holds ``0.0`` for that analyte. Rows without a matching BLOD record
    are left unchanged. For the ``dar_alias`` cohort the ``<analyte>_BLOD``
    column of the result is blanked on the same rows.
    NOTE(review): for the ``dar_alias`` cohort the flag is still read from
    the plain analyte column of the BLOD frame, as before - confirm it should
    not be read from ``<analyte>_BLOD``.
    """
    results = []
    for df, blod_df in dfs:
        df_res = df.copy()

        cohort_labels = df_res['CohortType'].unique()
        is_dar = len(cohort_labels) == 1 and cohort_labels[0] == dar_alias
        blod_id_col = 'unq_id' if is_dar else 'PIN_Patient'

        # Keep the first BLOD record per id so the lookup index is unique.
        first_records = blod_df.drop_duplicates(subset=blod_id_col).set_index(blod_id_col)

        for a in intersec_cols:
            flags = df_res['PIN_Patient'].map(first_records[a])
            # Boolean array by row position, so the cohort frame's own index
            # labels (which need not be 0..n-1) play no role.
            flagged = (flags == 0.0).to_numpy()
            if not flagged.any():
                continue
            if is_dar:
                blod_col = a + '_BLOD'
                if blod_col not in df_res.columns:
                    df_res[blod_col] = np.nan
                df_res.loc[flagged, blod_col] = np.nan
            df_res.loc[flagged, a] = np.nan

        results.append(df_res)

    # Return only after every cohort has been processed.
    return tuple(results)
# (cohort frame, BLOD frame) pairs handed to make_above_lod_df.
dfs = [(df_NEU,df_NEU_blod), (df_UNM, df_UNM_blod), (df_DAR, df_DAR_blod), (df_NHANES, df_NHANES_blod)]

# "alod" = above limit of detection: per-cohort frames, then the four-cohort merge.
df_NEU_alod, df_UNM_alod, df_DAR_alod, df_NHANES_alod = make_above_lod_df(dfs, intersec_analytes, DAR_alias)
df_ALL_alod, intersec_cols_alod = merge4CohortFrames2(df_NEU_alod, df_UNM_alod, df_DAR_alod, df_NHANES_alod)


**Summary Statistics**

In [10]:
##limit summary statistic values to two decimal places
def clean_cols(desc_DF):
    """Tidy a transposed ``describe()`` table in place.

    ``count`` is cast to int and every other summary statistic is rounded
    to two decimal places. Nothing is returned.
    """
    desc_DF["count"] = desc_DF["count"].astype(int)
    for stat in ("mean", "std", "min", "25%", "50%", "75%", "max"):
        desc_DF[stat] = round(desc_DF[stat], 2)

def desc_4_cohs(df_neu, df_dar, df_unm, df_nhanes, fi_name):
    """Summarise the shared analyte columns of each cohort and append them to a file.

    Parameters
    ----------
    df_neu, df_dar, df_unm, df_nhanes : pandas.DataFrame
        Cohort frames; each must contain every non-identifier column of the
        notebook-level ``df_ALL``.
    fi_name : str
        Output path without extension; ``".csv"`` is appended.

    Returns
    -------
    (list, list)
        ``[(summary_frame, cohort_alias), ...]`` in the order NEU, DAR, UNM,
        NHANES, and the list of described column names.

    Notes
    -----
    The file is opened in append mode, so calling this again with the same
    ``fi_name`` adds another copy of the tables. Each cohort contributes its
    alias, the plain-text table and its LaTeX rendering.
    """
    ##columns to describe: everything in the merged frame except identifiers
    id_cols = ['PIN_Patient', 'TimePeriod', 'CohortType']
    desc_cols = [col for col in df_ALL.columns if col not in id_cols]

    cohorts = (
        (df_neu, NEU_alias),
        (df_dar, DAR_alias),
        (df_unm, UNM_alias),
        (df_nhanes, NHANES_alias),
    )
    frames_names = []
    for cohort_df, alias in cohorts:
        summary = cohort_df[desc_cols].describe().transpose()
        clean_cols(summary)
        frames_names.append((summary, alias))

    # Append mode creates the file when it is missing; the context manager
    # guarantees that the single handle is closed.
    with open(fi_name + ".csv", "a") as f:
        for frame, name in frames_names:
            f.write(name + ',')
            f.write("\n\t")
            f.write(str(frame) + ',')
            f.write("\n\n")
            f.write(frame.to_latex())

    return frames_names, desc_cols


In [11]:
# Summary statistics per cohort: all values, then the above-LOD ("alod") frames.
frames_names, desc_cols = desc_4_cohs(df_NEU, df_DAR, df_UNM, df_NHANES, "Summary_Stats_DP_Paper_JGeller")
frames_names_alod, desc_cols_alod = desc_4_cohs(df_NEU_alod, df_DAR_alod, df_UNM_alod, df_NHANES_alod, "ALOD_Summary_Stats_DP_Paper_JGeller")

In [12]:
def df_to_log_cols(df, intersec):
    """Return a copy of ``df`` with the analyte columns replaced by their log10.

    Every name in ``intersec`` other than the identifier columns
    (``CohortType``, ``PIN_Patient``, ``TimePeriod``) is transformed; ``df``
    itself is left untouched.
    """
    identifiers = ('CohortType', 'PIN_Patient', 'TimePeriod')
    logged = df.copy()
    for name in intersec:
        if name in identifiers:
            continue
        logged[name] = np.log10(df[name])
    return logged

In [13]:
# log10-transformed copies of each cohort frame (all values).
df_NEU_log = df_to_log_cols(df_NEU, intersec_cols)
df_DAR_log = df_to_log_cols(df_DAR, intersec_cols)
df_UNM_log = df_to_log_cols(df_UNM, intersec_cols)
df_NHANES_log = df_to_log_cols(df_NHANES, intersec_cols)

# log10-transformed copies of the above-LOD ("alod") frames.
df_NEU_log_alod = df_to_log_cols(df_NEU_alod, intersec_cols)
df_DAR_log_alod = df_to_log_cols(df_DAR_alod, intersec_cols)
df_UNM_log_alod = df_to_log_cols(df_UNM_alod, intersec_cols)
df_NHANES_log_alod = df_to_log_cols(df_NHANES_alod, intersec_cols)


In [14]:
# Summary statistics of the log10 values, for all values and for the above-LOD frames.
frames_names_log, desc_cols_log = desc_4_cohs(df_NEU_log, df_DAR_log, df_UNM_log, df_NHANES_log, 'Summary_Stats_Log_Values_DP_Paper_JGeller')
frames_names_log_alod, desc_cols_log_alod = desc_4_cohs(df_NEU_log_alod, df_DAR_log_alod, df_UNM_log_alod, df_NHANES_log_alod, 'ALOD_Summary_Stats_Log_Values_DP_Paper_JGeller')

In [15]:
# Four-cohort merges of the log10 frames (all values, then above-LOD).
df_ALL_log, intersec_cols_log = merge4CohortFrames2(df_NEU_log, df_UNM_log, df_DAR_log, df_NHANES_log)
df_ALL_log_alod, intersec_cols_log_alod = merge4CohortFrames2(df_NEU_log_alod, df_UNM_log_alod, df_DAR_log_alod, df_NHANES_log_alod)


**Reporting of the counts per analyte provided by each cohort**


In [None]:
##returns a dict with count per analyte
def count_dict(frame_name, columns=None):
    """Build a cohort-by-analyte table of observation counts.

    Parameters
    ----------
    frame_name : iterable of (pandas.DataFrame, str)
        ``(summary_frame, cohort_name)`` pairs. Each summary frame is indexed
        by analyte name and has a ``"count"`` column.
    columns : list of str, optional
        Analyte names that become the table columns. Defaults to the
        notebook-level ``desc_cols``. The list is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per cohort: one column per analyte followed by
        ``'CohortType'``.

    Raises
    ------
    KeyError
        If a summary frame contains an analyte that is not in ``columns``.
    """
    if columns is None:
        columns = desc_cols

    ##build dictionary with an empty list for each column; work on a new list
    ##so the caller's column list does not gain a 'CohortType' entry per call
    keys = [col for col in columns if col != 'CohortType'] + ['CohortType']
    vals = {key: [] for key in keys}

    ##report counts for each data frame and add values to dictionary
    for summary, name in frame_name:
        vals["CohortType"].append(name)
        # .items() pairs each analyte label with its count without relying on
        # positional integer access into a label-indexed Series.
        for feature, feature_count in summary["count"].items():
            vals[feature].append(feature_count)

    ##create dataframe from values in dictionary
    return pd.DataFrame(columns = list(vals.keys()), data = vals)

In [None]:
# Per-cohort observation counts per analyte: all values, then above-LOD.
counts = count_dict(frames_names)
counts_alod = count_dict(frames_names_alod)

In [None]:
# Displaying counts in a dataframe with a gradient based on value
# seaborn is imported here as well so that this cell can be run on its own.
import seaborn as sns
# Light-to-blue colormap used to shade each cell of the counts table by its value.
cm = sns.light_palette("blue", as_cmap = True)
counts_df= counts.style.background_gradient(cmap = cm)
# Last expression of the cell: renders the styled table.
counts_df

##TAKE SCREENSHOT

**Making a Histogram of Counts per Analyte**

In [None]:
##creates a seaborn histogram based on a melted df and color scheme
def hist(df_melted, list_colors, fi_name):
    """Draw one bar chart per analyte with a bar per cohort and save it as JPEG.

    Parameters
    ----------
    df_melted : pandas.DataFrame
        Long-format table with ``'CohortType'``, ``'variable'`` and
        ``'value'`` columns.
    list_colors : list
        Bar colours, passed to seaborn as ``palette``.
    fi_name : str
        Path the figure is written to.
    """
    # sns.set() re-applies seaborn's default theme, so it has to run before
    # set_style(); in the opposite order the 'ticks' style is discarded.
    sns.set(font_scale=1.25)
    sns.set_style('ticks')

    g = sns.catplot(
        data = df_melted,
        x = 'CohortType', y = 'value',
        col ='variable', kind = 'bar', col_wrap = 5, sharey = False, palette = list_colors
    )
    # iterate through axes and label every bar with its height (the count)
    for ax in g.axes.ravel():

        # add annotations
        for c in ax.containers:
            labels = [f'{(v.get_height()):.0f}' for v in c]
            ax.bar_label(c, labels=labels, label_type = 'edge')
        # extra headroom so the labels are not clipped at the top of the axes
        ax.margins(y = 0.2)

    g.savefig(fi_name, format ='jpeg', dpi =200)


In [None]:
# Reshape the count tables to long format: one row per (cohort, analyte).
counts_melted = pd.melt(counts, id_vars = ['CohortType'])
counts_melted_alod = pd.melt(counts_alod, id_vars = ['CohortType'])

In [None]:
def rename_var(x):
    """Translate an analyte column code into its display name.

    Parameters
    ----------
    x : str
        Analyte code such as ``'UPB'``.

    Returns
    -------
    str
        The element name, or ``'NA'`` for any code without an entry. Note
        that applying this function to an already translated name therefore
        also yields ``'NA'``.
    """
    display_names = {
        'UPB': 'Lead',
        'UMO': 'Molybdenum',
        'UCO': 'Cobalt',
        'UCS': 'Cesium',
        'UHG': 'Mercury',
        'UCD': 'Cadmium',
        'UTL': 'Thallium',
        'USB': 'Antimony',
        # UTAS is total arsenic; it was previously labelled 'Tin'.
        'UTAS': 'Arsenic',
        # Tin is USN, which previously fell through to 'NA'.
        'USN': 'Tin',
        'UBA': 'Barium',
        'UMN': 'Manganese',
        'UTU': 'Tungsten',
    }
    return display_names.get(x, 'NA')


In [None]:
# TODO: some BLOD columns hold values other than 1.0 or 0.0
# (e.g. df_NEU_blod['UCD'] contains 0.5).
df_NEU_blod['UCD'].unique()

In [None]:
# Replace analyte codes with display names. rename_var returns 'NA' for
# anything it does not recognise, so running this cell twice turns every
# label into 'NA'.
counts_melted['variable'] = counts_melted['variable'].apply(rename_var)
counts_melted_alod['variable'] = counts_melted_alod['variable'].apply(rename_var)

In [None]:
# TODO: incorrect numbers for ALOD - PROTECT is almost 0 for ALOD and the rest of the
# cohorts have numbers identical to the non-ALOD frames.
# ALOD vs normal df
hist(counts_melted,  ['red', 'green', 'gray', 'blue'], 'Analyte_Counts_Hist_DP_Paper.jpg')
hist(counts_melted_alod,  ['red', 'green', 'gray', 'blue'], 'ALOD_Analyte_Counts_Hist_DP_Paper.jpg')

**Creating LOD Ratio graphs**

In [None]:
## returns if string is substring in any element of list
def list_contains(sub, lst):
    """Return True if ``sub`` is a substring of at least one element of ``lst``.

    An empty ``lst`` yields False.
    """
    # The previous test was `col in lst`, which is true for every element of
    # the list and never looked at `sub`.
    return any(sub in col for col in lst)

In [None]:
##round the columns of the df to specified number of digits
def round_cols_float(df, dec_places):
    """Return a copy of ``df`` with every non-identifier column rounded.

    Each value of every column other than ``PIN_Patient``, ``TimePeriod``
    and ``CohortType`` is passed through ``round(value, dec_places)``.
    The input frame is not modified.
    """
    identifiers = {'PIN_Patient', 'TimePeriod', 'CohortType'}
    rounded = df.copy()

    def _round_value(value):
        return round(value, dec_places)

    for name in [col for col in rounded.columns if col not in identifiers]:
        rounded[name] = rounded[name].apply(_round_value)
    return rounded

In [None]:
##helper to build df for lod
def lod_helper(df, intersec, coh_name, cohs_acc, var_acc, val_acc, denom_acc, col_ending):
    """Append per-analyte counts of 0.0 values and of non-missing values.

    For every name in ``intersec`` whose column ``name + col_ending`` exists
    in ``df``, one entry is appended to each accumulator: the cohort name,
    the analyte name, the number of cells equal to 0.0 and the number of
    non-NaN cells. The four accumulator lists are extended in place and
    returned.
    """
    for analyte in intersec:
        flag_col = analyte + col_ending
        if flag_col not in df.columns:
            continue
        flags = df[flag_col]
        cohs_acc.append(coh_name)
        var_acc.append(analyte)
        val_acc.append(len(flags[flags == 0.0]))
        denom_acc.append(len(flags) - sum(flags.isna()))
    return cohs_acc, var_acc, val_acc, denom_acc

In [None]:

# These three lists are not used in this cell; the accumulators returned by
# lod_helper are used instead.
cohs = []
var = []
val = []

# Accumulate, cohort by cohort, the number of 0.0 entries ('value') and of
# non-missing entries ('N') per analyte. The DAR call looks the flags up in
# columns ending in '_BLOD'; the other cohorts use the analyte name itself.
cohs_acc, var_acc, val_acc, denom_acc = lod_helper(df_NEU_blod, intersec_cols, NEU_alias, [], [], [], [], '' )
cohs_acc, var_acc, val_acc, denom_acc = lod_helper(df_DAR_blod, intersec_cols, DAR_alias, cohs_acc, var_acc, val_acc, denom_acc, '_BLOD')
cohs_acc, var_acc, val_acc, denom_acc = lod_helper(df_UNM_blod, intersec_cols, UNM_alias, cohs_acc, var_acc, val_acc, denom_acc, '')
# NHANES values are rounded to 2 decimals before being compared with 0.0.
df_NHANES_blod_float = round_cols_float(df_NHANES_blod, 2)


cohs_acc, var_acc, val_acc, denom_acc = lod_helper(df_NHANES_blod_float, intersec_cols, NHANES_alias, cohs_acc, var_acc, val_acc, denom_acc, '')


# Long-format table: one row per (cohort, analyte).
blod_counts_melted ={'CohortType':  cohs_acc, 'variable': var_acc, 'value': val_acc, 'N' : denom_acc}
blod_df = pd.DataFrame(data = blod_counts_melted)

In [None]:
##modifying blod_df to be percent of 0.0 values over all non-na values
blod_df_ratio = blod_df.copy()
blod_df_ratio['value'] = round(blod_df_ratio['value']/blod_df_ratio['N'],2)
m_blod_df_ratio = blod_df_ratio[~blod_df_ratio['variable'].isin(['CohortType', 'PIN_Patient', 'TimePeriod'])]

In [None]:
# Replace analyte codes with display names (unrecognised codes become 'NA').
m_blod_df_ratio['variable']=m_blod_df_ratio['variable'].apply(rename_var)

In [None]:
sns.set_style('ticks')

# One bar chart per analyte; each bar is a cohort's 0.0-value ratio.
g = sns.catplot(
    data = m_blod_df_ratio,
    x = 'CohortType', y = 'value',
    col ='variable', kind = 'bar', col_wrap = 5, sharey = False, palette = ['red', 'green', 'gray', 'blue']
)
# iterate through axes and label each bar with its height (the ratio, 2 decimals)
for ax in g.axes.ravel():

    # add annotations
    for c in ax.containers:
        labels = [f'{(v.get_height()):.2f}' for v in c]
        ax.bar_label(c, labels=labels, label_type = 'edge')
    # extra headroom so the labels are not clipped
    ax.margins(y = 0.2)

g.savefig('Above_LOD_Ratio_Hist_DP_Paper.jpg', format ='jpeg', dpi =1000)

**Creating Graph of Distributions**

In [None]:
##creating dataframe to create a boxplot
def long_on_analyte(df_neu, df_dar, df_unm, df_nhanes, intersec_cols):
    """Stack the four cohorts into one long-format table for plotting.

    Parameters
    ----------
    df_neu, df_dar, df_unm, df_nhanes : pandas.DataFrame
        Cohort frames containing every column named in ``intersec_cols``.
    intersec_cols : iterable of str
        Shared column names (a set or a list); must include ``'CohortType'``.
        ``'PIN_Patient'`` and ``'TimePeriod'`` are ignored. The argument is
        not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``CohortType``, ``variable`` (analyte name) and ``value``,
        with a fresh 0..n-1 index.
    """
    dropped = ('PIN_Patient', 'TimePeriod')
    # A list is required: pandas does not accept a set as a column indexer.
    # Sorting makes the analyte order independent of set iteration order.
    cols = sorted((col for col in intersec_cols if col not in dropped), key=str)

    melted = [
        pd.melt(frame[cols], id_vars = "CohortType")
        for frame in (df_neu, df_dar, df_unm, df_nhanes)
    ]
    # ignore_index gives every row a unique label; without it each cohort's
    # rows restart at 0 and the result has duplicate index labels.
    return pd.concat(melted, axis = 0, ignore_index = True)

In [None]:
# Long-format tables of all four cohorts: all values, then above-LOD.
all_combined_stacks_melts = long_on_analyte(df_NEU, df_DAR, df_UNM, df_NHANES, intersec_cols)
all_combined_stacks_melts_alod = long_on_analyte(df_NEU_alod, df_DAR_alod, df_UNM_alod, df_NHANES_alod, intersec_cols)

In [None]:
import math

# The row-by-row loops that used to be here only acted on rows whose
# 'variable' was not a string, and then read an 'Analyte' column that the
# melted tables do not have - so they either did nothing or raised KeyError.
# They are dropped (presumably 'variable' always holds string column names
# here - verify if column labels can be non-string).
# .copy() makes each selection an independent frame, so assigning to its
# 'variable' column later does not write into a slice of the source table.
m_all_combined_stacks_melts = all_combined_stacks_melts[['CohortType', 'value', 'variable']].copy()

m_all_combined_stacks_melts_alod = all_combined_stacks_melts_alod[['CohortType', 'value', 'variable']].copy()

In [None]:
# Preview the first rows of the long-format table (all values).
m_all_combined_stacks_melts.head(5)

In [None]:
# Preview the first rows of the long-format table (above-LOD values).
m_all_combined_stacks_melts_alod.head(5)

In [None]:
# Replace analyte codes with display names. After this cell the 'variable'
# column of both m_* tables no longer contains the original column codes,
# and running the cell a second time turns every label into 'NA'.
m_all_combined_stacks_melts['variable']=m_all_combined_stacks_melts['variable'].apply(rename_var)
m_all_combined_stacks_melts_alod['variable']=m_all_combined_stacks_melts_alod['variable'].apply(rename_var)

In [None]:
# Long-format table of the log10 analyte values, labelled with display names.
# (The previous first assignment, a copy of all_combined_stacks_melts, was
# overwritten on the next line and has been removed.)
all_combined_stacks_melts_log = long_on_analyte(df_NEU_log, df_DAR_log, df_UNM_log, df_NHANES_log, intersec_cols)
all_combined_stacks_melts_log['variable']=all_combined_stacks_melts_log['variable'].apply(rename_var)

import math

# rename_var always returns a string, so the former row-by-row "non-string
# variable" repair loop could never change anything and has been removed.
m_all_combined_stacks_melts_log = all_combined_stacks_melts_log[['CohortType', 'value', 'variable']].copy()

# Distribution Plot on log analyte value: one panel per analyte, one density
# curve per cohort.

g = sns.FacetGrid(m_all_combined_stacks_melts_log, col = 'variable', hue = 'CohortType', col_wrap = 4, sharex = False, sharey = False, palette =  ['red', 'green', 'gray', 'blue'])
p1 = g.map(sns.kdeplot, 'value').add_legend()
p1.savefig('Dist_Plot_Log_Analyte_DP_Paper.jpg', format = 'jpeg', dpi = 200)

In [None]:
# Above-limit-of-detection version of the log-scale distribution plot.

# Long-format table of the log10 above-LOD values, labelled with display names.
# (The previous first assignment, a copy of all_combined_stacks_melts_alod,
# was overwritten on the next line and has been removed.)
all_combined_stacks_melts_log_alod = long_on_analyte(df_NEU_log_alod, df_DAR_log_alod, df_UNM_log_alod, df_NHANES_log_alod, intersec_cols)
all_combined_stacks_melts_log_alod['variable']=all_combined_stacks_melts_log_alod['variable'].apply(rename_var)

import math

# rename_var always returns a string, so the former row-by-row "non-string
# variable" repair loop could never change anything and has been removed.
m_all_combined_stacks_melts_log_alod = all_combined_stacks_melts_log_alod[['CohortType', 'value', 'variable']].copy()

# Distribution Plot on log analyte value: one panel per analyte, one density
# curve per cohort.

g = sns.FacetGrid(m_all_combined_stacks_melts_log_alod, col = 'variable', hue = 'CohortType', col_wrap = 4, sharex = False, sharey = False, palette =  ['red', 'green', 'gray', 'blue'])
p1 = g.map(sns.kdeplot, 'value').add_legend()
p1.savefig('ALOD_Dist_Plot_Log_Analyte_DP_Paper.jpg', format = 'jpeg', dpi = 200)

**LOD: Percent of Detects that Fall in Max-Min LOD per Analyte**

In this section, we plot, for each cohort and analyte, the percentage of measured values that fall between the minimum and the maximum limit of detection (LOD) listed for that analyte in `LOD_ranges.csv`.

In [None]:
# Per-analyte LOD ranges; the cells below read the 'Analyte', 'Min_LOD' and
# 'Max_LOD' columns of this file.
lods = pd.read_csv('LOD_ranges.csv')
lods

In [None]:
# Build the long table from the cohort frames so that 'variable' holds the
# analyte column codes. The m_all_combined_stacks_melts table used before has
# its codes replaced by display names in an earlier cell, so filtering it by
# desc_cols (column codes) left no rows.
all_analytes = long_on_analyte(df_NEU, df_DAR, df_UNM, df_NHANES, intersec_cols)[['CohortType', 'value', 'variable']]
all_analytes = all_analytes[all_analytes['variable'].isin(desc_cols)]
all_analytes = all_analytes[all_analytes['variable'].isin(lods['Analyte'].values)]
all_analytes = all_analytes.reset_index(drop = True)

# Look up each row's LOD bounds (first entry per analyte in the LOD table)
# and flag the value as 1 when Min_LOD <= value <= Max_LOD, else 0.
# Missing values are flagged 0.
lod_bounds = lods.drop_duplicates(subset = 'Analyte').set_index('Analyte')
min_lod = all_analytes['variable'].map(lod_bounds['Min_LOD'])
max_lod = all_analytes['variable'].map(lod_bounds['Max_LOD'])
all_analytes['In_LOD_Range'] = all_analytes['value'].between(min_lod, max_lod).astype(int)

In [None]:
# Accumulators, one entry per (analyte, cohort) combination.
coh = []
analyte = []
in_range = []
n = []
cohorts = [NEU_alias, UNM_alias, DAR_alias, NHANES_alias]
for a in all_analytes['variable'].unique():
    for c in cohorts:
        # rows of this cohort and analyte
        t = all_analytes[all_analytes['CohortType']==c]
        t = t[t['variable']==a]
        coh.append(c)
        analyte.append(a)
        in_range.append(len(t[t['In_LOD_Range']==1]))
        # N counts every row flagged 1 or 0 for this combination
        n.append(len(t[t['In_LOD_Range']==1])+len(t[t['In_LOD_Range']==0]))
lods_in_range_df = pd.DataFrame(data = {'Cohort' : coh, 'Analyte' : analyte, 'In_LOD_Range' : in_range, 'N' : n})

In [None]:
# Fraction (0-1, rounded to 2 decimals) of rows inside the LOD range.
lods_in_range_df['Pct_In_Range'] = round(lods_in_range_df['In_LOD_Range']/lods_in_range_df['N'],2)
lods_in_range_df

In [None]:
sns.set_style('ticks')

# One bar chart per analyte; each bar is a cohort's in-range fraction.
g = sns.catplot(
    data = lods_in_range_df,
    x = 'Cohort', y = 'Pct_In_Range',
    col ='Analyte', kind = 'bar', col_wrap = 5, sharey = False, palette = ['red', 'green', 'gray', 'blue']
)
# iterate through axes and label each bar with its height (the fraction, 2 decimals)
for ax in g.axes.ravel():

    # add annotations
    for c in ax.containers:
        labels = [f'{(v.get_height()):.2f}' for v in c]
        ax.bar_label(c, labels=labels, label_type = 'edge')
    # extra headroom so the labels are not clipped
    ax.margins(y = 0.2)

g.savefig('Percent_LODS_In_Range_Ratio_Hist_DP_Paper.jpg', format ='jpeg', dpi =1000)

In [None]:
##checking for PROTECT for USB
d = all_combined_stacks_melts[all_combined_stacks_melts['CohortType']==NEU_alias]
# The melted table stores the analyte code in 'variable'; it has no 'Analyte' column.
d = d[d['variable']=='USB']
print('in range:', len(d[d['value'].between(0.001000, 0.40)]))
print('total:', len(d))
print('in range version 2:',len(df_NEU[df_NEU['USB'].between(0.001000, 0.4)]))

**Creating Boxplots**

In [None]:
# Replace analyte codes with display names in the full long-format tables.
# rename_var maps unrecognised input to 'NA', so this cell must be run only
# once per kernel session.
all_combined_stacks_melts['variable']=all_combined_stacks_melts['variable'].apply(rename_var)
all_combined_stacks_melts_alod['variable']=all_combined_stacks_melts_alod['variable'].apply(rename_var)

In [None]:
# Display the NHANES cohort label.
NHANES_alias

In [None]:
# Display the distinct cohort labels present in the long-format table.
all_combined_stacks_melts['CohortType'].unique()

In [None]:
##boxplots on analyte value (all values)
sns.set_style('whitegrid')

# Keep only rows whose label is one of the four cohort aliases.
plot_df = all_combined_stacks_melts[all_combined_stacks_melts['CohortType'].isin([NEU_alias,DAR_alias,UNM_alias, NHANES_alias])]
# One panel per analyte, one box per cohort, independent y-axes.
g = sns.catplot(
    data = plot_df,
    x = 'CohortType', y ='value',
    col ='variable', kind ='box', col_wrap = 5, sharey = False, palette =  ['red', 'green', 'gray', 'blue'])

# set rotation
g.set_xticklabels(rotation=45)

g.savefig('Boxplots_DP_Paper.jpg', format = 'jpeg', dpi = 500)

In [None]:
##boxplots on analyte value (above-LOD values)
sns.set_style('whitegrid')

# Keep only rows whose label is one of the four cohort aliases.
plot_df = all_combined_stacks_melts_alod[all_combined_stacks_melts_alod['CohortType'].isin([NEU_alias,DAR_alias,UNM_alias, NHANES_alias])]
# One panel per analyte, one box per cohort, independent y-axes.
g = sns.catplot(
    data = plot_df,
    x = 'CohortType', y ='value',
    col ='variable', kind ='box', col_wrap = 5, sharey = False, palette =  ['red', 'green', 'gray', 'blue'])

# set rotation
g.set_xticklabels(rotation=45)

g.savefig('ALOD_Boxplots_DP_Paper.jpg', format = 'jpeg', dpi = 500)

In [None]:
# Display the cohort labels present in the most recently built plot_df.
plot_df['CohortType'].unique()

**One-Way Anova of Geometric Means**

Meeting assumptions

1. Normally distributed: each group has more than 30 data points, so by the Central Limit Theorem the sampling distribution of each group mean is approximately normal
2. Independent Groups: met by structure of data
3. Homogeneity of Variances (equal variances): will be tested for each analyte group

Source: https://online.stat.psu.edu/stat200/lesson/10/10.2

In [None]:
import statistics as stat
from scipy.stats import f_oneway
from statistics import variance

##get colors to print in color
class bcolors:
    # Terminal escape sequences: prefix text with one of these to change how
    # it is printed, and follow it with ENDC to return to normal output.
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'  # reset
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

##conduct a one way anova of all analytes
def one_way_anova(df_all, fi_name, intersec):
    """Run a one-way ANOVA across the four cohorts for every analyte.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Merged frame with a ``'CohortType'`` column and one column per
        analyte. This is the data that is tested.
    fi_name : str
        Path of the CSV file the result table is written to.
    intersec : iterable of str
        Column names to test; ``TimePeriod``, ``CohortType`` and
        ``PIN_Patient`` are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per analyte: the equal-variance check, the ANOVA p-value,
        whether p <= 0.05, and the four variances (rounded to 3 decimals,
        ordered NEU, DAR, UNM, NHANES).

    Raises
    ------
    statistics.StatisticsError
        If a cohort has fewer than two non-missing values for an analyte.

    Notes
    -----
    The ANOVA is computed for every analyte, whether or not the
    equal-variance check passes; the check is only reported.
    """
    ##define lists that report results of anova
    analytes = []
    assumptions = []
    p_vals = []
    sig = []
    variances = []

    for col in intersec:
        if col in ['TimePeriod', 'CohortType', 'PIN_Patient']:
            continue
        analytes.append(col)
        ##get series of analyte values w/out NaN, from the frame that was passed in
        NEU = df_all[df_all['CohortType'] == NEU_alias][col].dropna()
        UNM = df_all[df_all['CohortType'] == UNM_alias][col].dropna()
        DAR = df_all[df_all['CohortType'] == DAR_alias][col].dropna()
        NHANES = df_all[df_all['CohortType'] == NHANES_alias][col].dropna()

        raw_var = [variance(NEU), variance(DAR), variance(UNM), variance(NHANES)]
        variances.append([round(v, 3) for v in raw_var])

        ##rule of thumb: variances are "equal" when largest / smallest < 2.
        ##The ratio uses the unrounded variances: rounding to 3 decimals first
        ##can turn a small variance into 0 and make the division fail.
        smallest = min(raw_var)
        equal_var = bool(smallest > 0 and max(raw_var) / smallest < 2)
        if equal_var:
            # ENDC resets the terminal colour after the message
            print(bcolors.OKGREEN + col, "passes assumptions" + bcolors.ENDC, '\n')
        else:
            print(bcolors.FAIL + col, "fails: unequal variances" + bcolors.ENDC)
        assumptions.append(equal_var)

        #perform one-way ANOVA
        p = f_oneway(NEU , UNM ,DAR , NHANES).pvalue
        p_vals.append(p)
        # `p <= 0.05` is False for a NaN p-value, so an undefined test is not
        # reported as significant.
        sig.append(bool(p <= 0.05))

    ##make df from results of anova
    data = {'Analytes' : analytes, 'Passes Equal Variance Assumption' : assumptions, 'P-Value' : p_vals,
            'Significant' : sig, 'Variance' : variances}
    res_df = pd.DataFrame(data)
    res_df.to_csv(fi_name, index = False)
    return res_df

In [None]:
# ANOVA result tables for the raw-scale and log-scale merged frames; each
# call also writes its table to the CSV file named in the call.
anova_df = one_way_anova(df_ALL, 'one_way_anova_results.csv', intersec_cols)
anova_df_log = one_way_anova(df_ALL_log, 'log_one_way_anova_results.csv', intersec_cols_log)

In [None]:
# Same as the previous cell, for the above-LOD ("alod") merged frames.
anova_df_alod = one_way_anova(df_ALL_alod, 'ALOD_one_way_anova_results.csv', intersec_cols)
anova_df_log_alod = one_way_anova(df_ALL_log_alod, 'ALOD_log_one_way_anova_results.csv', intersec_cols_log)

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
##post hoc tests on the analytes tested by ANOVA
##if p < 0.05, there is a statistically significant difference in means
def tukey(df_neu, df_dar, df_unm, df_nhanes, cols_to_test, fi_name):
    """Run Tukey's HSD between the four cohorts for each analyte and log the results.

    Parameters
    ----------
    df_neu, df_dar, df_unm, df_nhanes : pandas.DataFrame
        Cohort frames. Each needs a ``'CohortType'`` column, which is used as
        the group label, and every column named in ``cols_to_test``.
    cols_to_test : iterable of str
        Analyte columns to test.
    fi_name : str
        File the textual results are appended to (created if missing).

    Returns
    -------
    None
        The result for the last analyte is printed. Nothing is printed when
        ``cols_to_test`` is empty.
    """
    cohort_frames = (df_neu, df_dar, df_unm, df_nhanes)
    last_result = None

    # A single append-mode handle replaces the former open('x') / bare except
    # sequence, which always ended in the append branch and left the first
    # handle unclosed.
    with open(fi_name, 'a') as f:
        for col in cols_to_test:
            # Use the frames that were passed in, dropping rows where this
            # analyte is missing.
            df = pd.concat(
                [frame[[col, 'CohortType']].dropna(subset = [col]) for frame in cohort_frames]
            )
            last_result = pairwise_tukeyhsd(endog = df[col],
                                            groups = df['CohortType'],
                                            alpha=0.05)
            f.write(col + ' Tukey Result: \n')
            f.write(str(last_result))
            f.write('\n\n')

    if last_result is not None:
        print(str(last_result))
    return None

In [None]:
# Tukey post-hoc tests for each data variant; the analyte list of every run
# comes from the ANOVA table of the same variant.
cols_to_test = anova_df['Analytes']
tukey(df_NEU, df_DAR, df_UNM, df_NHANES, cols_to_test, 'tukey_results.csv')

cols_to_test_log = anova_df_log['Analytes']
tukey(df_NEU_log, df_DAR_log, df_UNM_log, df_NHANES_log, cols_to_test_log, 'log_tukey_results.csv')

# was anova_df['Analytes']; the above-LOD run should use the above-LOD table
cols_to_test_alod = anova_df_alod['Analytes']
tukey(df_NEU_alod, df_DAR_alod, df_UNM_alod, df_NHANES_alod, cols_to_test_alod, 'ALOD_tukey_results.csv')

cols_to_test_log_alod = anova_df_log_alod['Analytes']
tukey(df_NEU_log_alod, df_DAR_log_alod, df_UNM_log_alod, df_NHANES_log_alod, cols_to_test_log_alod, 'alod_log_tukey_results.csv')

**Kolmogorov-Smirnov Test**

Source: https://www.itl.nist.gov/div898/handbook/eda/section3/eda35g.htm

Note: it still needs to be confirmed that the following assumption is met: "Perhaps the most serious limitation is that the distribution must be fully specified. That is, if location, scale, and shape parameters are estimated from the data, the critical region of the K-S test is no longer valid. It typically must be determined by simulation."

In [None]:
##determining if two samples came from the same distribution
def ks(df_all, col_to_test, fi_name, cohorts=None):
    """Two-sample Kolmogorov-Smirnov test for every ordered pair of cohorts.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Merged frame with a ``'CohortType'`` column and one column per
        analyte. This is the data that is tested.
    col_to_test : iterable of str
        Analyte columns to test.
    fi_name : str
        Path of the CSV file the result table is written to.
    cohorts : list of str, optional
        ``CohortType`` labels to compare. Defaults to the four notebook
        aliases (NEU, UNM, DAR, NHANES).

    Returns
    -------
    pandas.DataFrame
        Columns ``Cohort1``, ``Cohort2``, ``Analyte``, ``P-Value`` and
        ``distribution matches`` (True when p > 0.05). Every pair appears
        twice, once in each order.
    """
    from scipy.stats import ks_2samp

    if cohorts is None:
        cohorts = [NEU_alias, UNM_alias, DAR_alias, NHANES_alias]

    cohort1 = []
    cohort2 = []
    analyte = []
    p_vals = []

    for col in col_to_test:
        # non-missing values of this analyte, per cohort
        samples = {
            coh: df_all[df_all['CohortType'] == coh].dropna(subset = [col])[col]
            for coh in cohorts
        }
        for coh1 in cohorts:
            for coh2 in cohorts:
                if coh1 == coh2:
                    continue
                ##two-sided test: the null hypothesis is that both samples
                ##come from the same distribution; p < 0.05 rejects it
                p_val = ks_2samp(samples[coh1], samples[coh2]).pvalue
                cohort1.append(coh1)
                cohort2.append(coh2)
                p_vals.append(p_val)
                analyte.append(col)

    res_df = pd.DataFrame(data = {'Cohort1' : cohort1, 'Cohort2' : cohort2, 'Analyte' : analyte, 'P-Value' : p_vals})
    res_df['distribution matches'] = res_df['P-Value'].map(lambda p : p > 0.05 )
    res_df.to_csv(fi_name, index = False)
    return res_df


In [None]:
# K-S result tables for the raw and log-scale merged frames; each call also
# writes its table to the CSV file named in the call.
df_ks = ks(df_ALL, cols_to_test, 'ks_test_results.csv')
df_ks_log = ks(df_ALL_log, cols_to_test, 'log_ks_test_results.csv')

# Same for the above-LOD ("alod") merged frames.
df_ks_alod = ks(df_ALL_alod, cols_to_test_alod, 'ALOD_ks_test_results.csv')
df_ks_log_alod = ks(df_ALL_log_alod, cols_to_test_alod, 'ALOD_log_ks_test_results.csv')