# Harmonizing analytes, creating visualizations, running analysis 

The goal of this notebook is to find the intersected features between all 4 cohorts, create graphs visualizing the distributions of the analytes, and analyze those distributions. 

Author: Julia Geller

Last Edit: 03/06/2022

### Setup

This notebook explains how to access the dataset models from Django.
The datasets are loaded from the Postgres database into pandas dataframes.

To start the notebook server:

```
# Start a bash shell inside the api container
docker-compose exec api /bin/bash

# Start the jupyter notebook
python manage.py shell_plus --notebook

# Take note of the token provided and access the notebook through:
<ip-of-the-machine>:7777/?token=<token>
# e.g. http://127.0.0.1:7777/?token=<token>
```

This step only has to be done once, as long as the Jupyter notebook server keeps running.
While the server is still running, you can access the notebook using:
``` 
<ip-of-the-machine>:7777/?<token>
```

In [1]:
# Required to access the database
import os
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

# Data analysis tools
import pandas as pd
import numpy as np
import seaborn as sns

# Models available in our application
from datasets.models import RawFlower, RawUNM, RawDAR
from django.contrib.auth.models import User

In [2]:
from api import adapters
from api import analysis



### Transforming NHANES from Uploaded Version

In [9]:
## FIXME (for Zlatan): neither dataframe below is imported correctly.
## nhanes_df contains null values where there should be none, and
## nhanes_df_blod contains values other than 1/0 in its BLOD columns.
from datasets.models import RawNHANES_BIO
nhanes_df = adapters.nhanes.get_dataframe_orig()
nhanes_df_blod=adapters.nhanes.get_dataframe_orig_blod()

nhanes_df.shape

(31608, 23)

In [None]:
## blod_mapper is a stop-gap: it should not need to exist because BLOD should only be 1 and 0
def blod_mapper(n):
    """Collapse a raw BLOD value to 0, 1, or the sentinel 9999.

    Values below 0.5 map to 0 and values in (0.5, 1] map to 1. Anything
    else (exactly 0.5, values above 1, NaN) maps to the sentinel 9999.
    """
    if n < 0.5:
        return 0
    if 0.5 < n <= 1:
        return 1
    return 9999

# Query every RawNHANES_BIO record into a dataframe (the exclude() filters
# below are commented out, so no rows are dropped here).
df = pd.DataFrame.from_records(
    RawNHANES_BIO.objects.
    # exclude(Creat_Corr_Result__lt=-1000).
    # exclude(Creat_Corr_Result__isnull=True).
    values()
)
print(df.columns)

# Normalise Blod to 0 / 1 / missing. 9999 is a temporary sentinel for
# "missing or unexpected" so the column can go through astype(int);
# it is converted back to NaN afterwards.
df['Blod']=df['Blod'].fillna(9999)

df['Blod']=df['Blod'].map(blod_mapper)
df['Blod']=df['Blod'].astype(int)
# np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
df['Blod']=df['Blod'].replace(9999, np.nan)

## new covariates
df['Member_c'] = 1
# Positional rename: this raises unless the frame has exactly these 16 columns.
# NOTE(review): assumes values() returns the model fields in this order - confirm.
df.columns = ['id', 'PIN_Patient', 'Age', 'TimePeriod', 'Pregnant', 'Marital',
   'Child_A', 'Child_B', 'H_Inc', 'F_Inc', 'Edu', 'Rac', 'BLOD',
   'Result', 'Analyte','Member_c']
#ga at collection

# Pivoting the table and resetting index
numerical_values = ['Result']

columns_to_indexes = ['PIN_Patient', 'TimePeriod', 'Member_c' ]
# NOTE(review): the pivot spreads 'BLOD' into columns, while the sample output
# below shows one column per Analyte - confirm which one is intended.
categorical_to_columns = ['BLOD']

df = pd.pivot_table(df, values=numerical_values,
                    index=columns_to_indexes,
                    columns=categorical_to_columns)

df = df.reset_index()


# TODO - Should we drop NaN here?

# After pivot
# Analyte     TimePeriod Member_c       BCD  ...      UTMO       UTU       UUR
# PIN_Patient                                ...
# A0000M               1        1  1.877245  ...  0.315638  1.095520  0.424221
# A0000M               3        1  1.917757  ...  0.837639  4.549155  0.067877
# A0001M               1        1  1.458583  ...  0.514317  1.262910  1.554346
# A0001M               3        1  1.365789  ...  0.143302  1.692582  0.020716
# A0002M               1        1  1.547669  ...  0.387643  0.988567  1.081877

df['CohortType'] = 'NHANES'
#df['TimePeriod'] = pd.to_numeric(df['TimePeriod'], errors='coerce')

df.head(5)

# Get Dataframes

In [None]:
##commented out for testing purposes
""" from datasets.models import RawNEU
df = pd.DataFrame.from_records(
        RawNEU.objects.
        # exclude(Creat_Corr_Result__lt=-1000).
        # exclude(Creat_Corr_Result__isnull=True).
        values()
    )

"""

# Load the NEU cohort frame and its companion BLOD frame through the adapters.
neu_df = adapters.neu.get_dataframe()
neu_df_blod=adapters.neu.get_dataframe_BLOD()

##making mock df: overwrite each analyte column of neu_df_blod with random 0/1 flags.
##One value is drawn per row of neu_df_blod; a fixed count (previously 2200) raises a
##length-mismatch error as soon as the frame has a different number of rows.
##The draws are unseeded, so the mock flags differ between runs.
for col in ['UBA','UBE', 'UCD', 'UCO', 'UCR', 'UCS', 'UCU', 'UHG', 'UMN', 'UMO', 'UNI',
            'UPB', 'UPT', 'USB', 'USE', 'USN', 'UTAS', 'UTL', 'UTU', 'UUR', 'UVA',
            'UZN']:
    neu_df_blod[col]=np.random.randint(0,2,size=len(neu_df_blod))

##unm_df = adapters.unm.get_dataframe_orig()
##unm_df_blod=adapters.unm.get_dataframe_orig_BLOD()

##Placeholder while the UNM adapter calls above are disabled: these names are
##aliases of the NEU frames (the same objects, not copies).
unm_df=neu_df
unm_df_blod=neu_df_blod

##DELETE once have real dar data
dar_df=neu_df
dar_df_blod=neu_df_blod

##dar_df = adapters.dar.get_dataframe()
####dar_df_blod = adapters.dar.get_dataframe_BLOD()



In [None]:
neu_df.head()

In [None]:
nhanes_df_blod.head()

In [None]:
neu_df_blod.head()

### Intersected Features 

In [None]:
##columns that are NOT biometric samples (identifiers, covariates, outcomes, model outputs)
list_notin=[
    'PIN_Patient', 'TimePeriod', 'Member_c', 'Outcome', 'Outcome_weeks',
    'age', 'ethnicity', 'race', 'education', 'BMI', 'income', 'smoking',
    'parity', 'preg_complications', 'folic_acid_supp', 'fish', 'babySex',
    'birthWt', 'headCirc', 'birthLen', 'WeightCentile', 'LGA', 'SGA',
    'ga_collection', 'creatininemgdl_x', 'birth_year', 'CohortType', 'original',
    'prediction', 'prediction_xvalue', 'original_xvalue',
    'creatininemgdl_y', 'zscore', 'Cohort', 'dil_indicator', 'fish_pu_v2',
    'SPECIFICGRAVITY_V2_x', 'SPECIFICGRAVITY_V2_y',
]


def cat_samples_list(df):
    """Return the column names of ``df`` that are not in ``list_notin``.

    The names are returned as a list, in the order they appear in ``df.columns``.
    """
    return [column for column in df.columns if column not in list_notin]
# Sample (analyte) columns present in each cohort's dataframe.
neu_anlaytes_list = cat_samples_list(neu_df)
unm_anlaytes_list = cat_samples_list(unm_df)
dar_anlaytes_list = cat_samples_list(dar_df)
nhanes_anlaytes_list = cat_samples_list(nhanes_df)

# Columns shared by NEU+UNM, then NEU+UNM+DAR, then all four cohorts.
# The list results are sorted because set iteration order for strings can
# change between kernel restarts, which would reorder every downstream
# table and plot. (Assumes the column names are mutually comparable,
# e.g. all strings.)
neu_unm = set(neu_anlaytes_list) & set(unm_anlaytes_list)
neu_unm_dar = sorted(neu_unm & set(dar_anlaytes_list))
all_cohorts = sorted(set(neu_unm_dar) & set(nhanes_anlaytes_list))

len(all_cohorts)


In [None]:
print(all_cohorts)

### Creating a DF with intersected features

##### Can you also rewrite the function to see what is harmonized across two cohorts?


In [None]:
# Put the patient identifier back as the first selected column (it is not
# part of the analyte intersection).
if 'PIN_Patient' not in all_cohorts:
    all_cohorts.insert(0,"PIN_Patient")


print("Cohort(s) : Total Data Size (for interescted features)")

# Each selection is copied so that adding CohortType writes to an independent
# frame instead of a slice of the source frame (avoids SettingWithCopyWarning).
neu_intersected=neu_df[all_cohorts].copy()
neu_intersected["CohortType"]="NEU"
neu_size=neu_intersected.shape[0]
print("NEU : " +format(neu_size, "0.0f"))


unm_intersected=unm_df[all_cohorts].copy()
unm_intersected["CohortType"]="UNM"
unm_size=unm_intersected.shape[0]
print("UNM : " + format(unm_size,"0.0f"))

dar_intersected=dar_df[all_cohorts].copy()
dar_intersected["CohortType"]="DAR"
dar_size=dar_intersected.shape[0]
# Report DAR's own size (this line previously printed unm_size).
print("DAR : " + format(dar_size,"0.0f"))

nhanes_intersected=nhanes_df[all_cohorts].copy()
nhanes_intersected["CohortType"]="NHANES"
nhanes_size=nhanes_intersected.shape[0]
print("NHANES : " + format(nhanes_size,"0.0f"))

# From here on all_cohorts also selects CohortType (inserted at the front).
if 'CohortType' not in all_cohorts:
    all_cohorts.insert(0,"CohortType")

In [None]:

       # Orphaned fragment of what appears to be a pasted column listing. As bare
       # indented code it stopped the cell with an IndentationError, so it is kept
       # here as a comment for reference only.
       # 'age', 'ethnicity', 'race', 'BMI', 'smoking', 'parity',
       # 'preg_complications', 'folic_acid_supp', 'fish', 'babySex', 'birthWt',
       # 'birthLen', 'headCirc', 'WeightCentile', 'LGA', 'SGA', 'ga_collection',
       # 'education', 'birth_year', 'SPECIFICGRAVITY_V2', 'fish_pu_v2', 'UBA',
       # 'UBE', 'UCD', 'UCO', 'UCR', 'UCS', 'UCU', 'UHG', 'UMN', 'UMO', 'UNI',
       # 'UPB', 'UPT', 'USB', 'USE', 'USN', 'UTAS', 'UTL', 'UTU', 'UUR', 'UVA',
       # 'UZN', 'CohortType'],

In [None]:
nhanes_df.columns

In [None]:
# 1) 
neu_df[all_cohorts].describe().transpose()

##tidies a describe() summary for display
def clean_cols(DF_desc):
    """Tidy a transposed ``describe()`` summary in place.

    Casts the "count" column to int and rounds every other summary
    statistic to two decimal places. Modifies ``DF_desc`` directly and
    returns None.
    """
    DF_desc["count"] = DF_desc["count"].astype(int)
    for stat in ("mean", "std", "min", "25%", "50%", "75%", "max"):
        DF_desc[stat] = DF_desc[stat].round(2)



# Build one tidied describe() summary per cohort.
# NEU
NEU_desc=neu_df[all_cohorts].describe().transpose()
clean_cols(NEU_desc)

# DAR - dummy
DAR_desc=dar_df[all_cohorts].describe().transpose()
clean_cols(DAR_desc)

# UNM - dummy
UNM_desc=unm_df[all_cohorts].describe().transpose()
clean_cols(UNM_desc)

# NHANES - dummy
NHANES_desc=nhanes_df[all_cohorts].describe().transpose()
clean_cols(NHANES_desc)

# (summary frame, cohort label) pairs; also reused by the counts report.
frames_names = [(NEU_desc,"NEU"), (DAR_desc, "DAR"),(UNM_desc,"UNM"),(NHANES_desc,"NHANES")]

file_name = "Summary_Stats_Distribution_Paper_JGeller"

# Append each cohort's summary to the report file. Mode "a" creates the file
# when it does not exist and appends otherwise, so no separate "x" probe is
# needed; the with-block closes the handle even if a write fails.
# Note: each frame is written as its printed text (str(frame)), not as
# comma-separated values, despite the .csv extension.
with open(file_name+".csv", "a") as f:
    for frame, name in frames_names:
        ##content=string(name ,"/n",frame,"\n\n")
        f.write(name)
        f.write("\n\t")
        f.write(str(frame))
        f.write("\n\n")

frames_names

In [None]:
#2) TODO: Create a report of the counts per analyte provided by each cohort
# and plot the counts using a histogram

##build dictionary with one (initially empty) list per column; PIN_Patient gets no entry
vals = {col: [] for col in all_cohorts if col != "PIN_Patient"}

##report counts for each summary frame and add values to dictionary
for frame, name in frames_names:
    print('\033[1m'+name+'\033[0m')
    vals["CohortType"].append(name)

    # Iterate (label, value) pairs rather than indexing the label-indexed
    # "count" Series by integer position, which pandas has deprecated.
    for feature, feature_count in frame["count"].items():
        print("   "+feature+": "+format(feature_count,"0.0f"))
        vals[feature].append(feature_count)

##create dataframe from values in dictionary: one row per cohort, one column per feature
counts=pd.DataFrame(columns=list(vals.keys()), data=vals)
counts



In [None]:
import seaborn as sns
# Light-to-green colormap used to shade the counts table.
cm = sns.light_palette("green", as_cmap=True)
# cool feature for data frames if you want to make the dataframe also show a gradient
# Note: .style returns a pandas Styler (a display wrapper), so counts_df is a
# styled view of `counts` rather than a DataFrame.
counts_df=counts.style.background_gradient(cmap=cm)
counts_df

### Matplotlib histogram 


### Seaborn histogram - finalized

Advantages
- Clean
- Simple
- Don't manually add features

Disadvantages

...

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style('whitegrid')
# Long format: one row per (CohortType, variable) pair with its count in 'value'.
counts_melted=pd.melt(counts, id_vars = ['CohortType'])


# One bar-chart facet per 'variable', five facets per row; sharey=False gives
# every facet its own y scale.
g = sns.catplot(
    data=counts_melted,
    x='CohortType', y='value',
    col='variable', kind='bar', col_wrap=5, sharey = False,       
    palette=sns.color_palette(['black', 'green','gray']))


# Label every bar with its height:
# g.axes holds the facet axes; each axis keeps its drawn bars in ax.containers.
# iterate through axes
for ax in g.axes.ravel():
    
    # add annotations
    for c in ax.containers:
        # bar height formatted with no decimals, one label per bar in the container
        labels = [f'{(v.get_height()):.0f}' for v in c]
        ax.bar_label(c, labels=labels, label_type='edge')
    # extra headroom on the y axis (margins(y=0.2)) so the edge labels fit inside the plot
    ax.margins(y=0.2)



In [None]:
# 3) TODO: Generate boxplot data to visualize individual and combinations of data

import seaborn as sns 
import matplotlib.pyplot as plt
import pandas as pd


def _melt_cohort(cohort_df):
    """Reshape one cohort's intersected frame to long format.

    Drops PIN_Patient and melts every remaining column except CohortType
    into (CohortType, variable, value) rows. var_name is passed explicitly
    so the middle column is always called "variable", even when the frame's
    column index carries a name such as "Analyte".
    """
    return pd.melt(cohort_df.drop('PIN_Patient', axis = 1),
                   id_vars="CohortType", var_name="variable")


##creating dataframe - melt each cohort's frame, then stack them into one df
##for below boxplot
# NOTE(review): neu/dar/unm_intersected_cols and intersec_3_cohs_list are not
# assigned in any cell shown in this notebook (an earlier cell builds
# neu_intersected, dar_intersected, unm_intersected and all_cohorts) - confirm
# where they come from before relying on Restart & Run All.
neu_ints_melt = _melt_cohort(neu_intersected_cols)
dar_ints_melt = _melt_cohort(dar_intersected_cols)
unm_ints_melt = _melt_cohort(unm_intersected_cols)

all_combined_stacks_melts=pd.concat([neu_ints_melt,unm_ints_melt,dar_ints_melt], axis=0)

print("Checking that melted df has same correct number of values per cohorts")
print("Cohort values: ", all_combined_stacks_melts["CohortType"].unique())
# Each cohort should contribute (cohort size) x (list length minus 2) rows.
# Rows are counted with a boolean sum instead of .count()[0], which indexes a
# label-indexed Series by position (deprecated in pandas).
n_value_columns = len(intersec_3_cohs_list)-2
for cohort_name, cohort_size in [("NEU", neu_size), ("DAR", dar_size), ("UNM", unm_size)]:
    n_rows = (all_combined_stacks_melts["CohortType"]==cohort_name).sum()
    print(n_rows,"=", cohort_size*n_value_columns)
                                                                                                                          

In [None]:
sns.set_style('whitegrid')

# Keep only the three cohorts being compared.
plot_df = all_combined_stacks_melts[all_combined_stacks_melts['CohortType'].isin(['NEU','DAR', 'UNM'])]
# One box-plot facet per 'variable', five per row, each with its own y scale.
g = sns.catplot(
    data=plot_df,
    x='CohortType', y='value',
    col='variable', kind='box', col_wrap=5, sharey = False,
    palette=sns.color_palette(['black', 'green','gray']), order=['NEU','DAR', 'UNM']
)

# set rotation
g.set_xticklabels(rotation=90)

# The lines below are R / ggplot2 code (an annotation example). Run as Python
# they parse but raise NameError, stopping the cell, so they are kept only as
# a reference comment.
# ann_text <- data.frame(mpg = 15,wt = 5,lab = "Text",
#                        cyl = factor(8,levels = c("4","6","8")))
# p + geom_text(data = ann_text,label = "Text")


In [None]:
# Select the UASB values for each cohort from the long-format plot_df.
is_uasb = plot_df["variable"]=="UASB"
neu_uasb=plot_df.loc[(plot_df["CohortType"]=="NEU") & is_uasb, "value"]
unm_uasb=plot_df.loc[(plot_df["CohortType"]=="UNM") & is_uasb, "value"]
dar_uasb=plot_df.loc[(plot_df["CohortType"]=="DAR") & is_uasb, "value"]

# Box order on the plot: NEU, DAR, UNM.
data = [neu_uasb,dar_uasb, unm_uasb]
fig7, ax7 = plt.subplots()
ax7.set_title('Multiple Samples with Different sizes')
ax7.boxplot(data)

plt.show()
"""

# Create two subplots and unpack the output array immediately
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
ax1.plot(x, y)
ax1.set_title('Sharing Y axis')
ax2.scatter(x, y)

# Create four polar axes and access them through the returned array
fig, axs = plt.subplots(2, 2, subplot_kw=dict(projection="polar"))
axs[0, 0].plot(x, y)
axs[1, 1].scatter(x, y)

# Share a X axis with each column of subplots
plt.subplots(2, 2, sharex='col')

# Share a Y axis with each row of subplots
plt.subplots(2, 2, sharey='row')

# Share both X and Y axes with all subplots
plt.subplots(2, 2, sharex='all', sharey='all')

# Note that this is the same as
plt.subplots(2, 2, sharex=True, sharey=True)

# Create figure number 10 with a single subplot
# and clears it if it already exists.
fig, ax = plt.subplots(num=10, clear=True)
"""

In [None]:

# Rebuilds the per-cohort UASB value series (the same selection as the cell
# above) so that `data` is available to the subplot-grid cell that follows.
neu_uasb=plot_df[plot_df["CohortType"]=="NEU"]
neu_uasb=neu_uasb[neu_uasb["variable"]=="UASB"]["value"]

unm_uasb=plot_df[plot_df["CohortType"]=="UNM"]
unm_uasb=unm_uasb[unm_uasb["variable"]=="UASB"]["value"]

dar_uasb=plot_df[plot_df["CohortType"]=="DAR"]
dar_uasb=dar_uasb[dar_uasb["variable"]=="UASB"]["value"]

# Series order: NEU, DAR, UNM.
data = [neu_uasb,dar_uasb, unm_uasb]



In [None]:
import math

# NOTE(review): intersec_3_cohs_list is not assigned in any cell shown in this
# notebook - confirm where it comes from. Its first two entries are skipped.
features=intersec_3_cohs_list[2:]

# Smallest square grid with at least one panel per feature. The grid is sized
# from len(features) rather than from a loop counter left over from another
# cell, and squeeze=False keeps axs two-dimensional even for a 1x1 grid.
grid_side = max(1, math.ceil(math.sqrt(len(features))))
fig, axs = plt.subplots(grid_side, grid_side, squeeze=False)

# Fill the grid row by row, one panel per feature; divmod keeps both indices
# inside the grid.
for panel_idx in range(len(features)):
    row_idx, col_idx = divmod(panel_idx, grid_side)
    # NOTE(review): every panel draws the same `data` with a placeholder title;
    # per-feature data still needs to be wired in.
    axs[row_idx, col_idx].boxplot(data)
    axs[row_idx, col_idx].set_title('Axis ['+ str(row_idx)+ " , " +str(col_idx) +']')

# Hide the panels left over when len(features) is not a perfect square.
for ax in axs.ravel()[len(features):]:
    ax.set_visible(False)

for ax in axs.flat:
    ax.set(xlabel='x-label', ylabel='y-label')

# Hide x labels and tick labels for top plots and y ticks for right plots.
for ax in axs.flat:
    ax.label_outer()
fig.tight_layout()
    

In [None]:
# Despite its name, `medians` holds the number of non-null values per
# (CohortType, variable) group - it is built with count(), not median().
medians = plot_df.groupby(['CohortType',"variable"])['value'].count()
vertical_offset = plot_df['value'].median() * 0.05 # offset from median for display
print(medians)
# NOTE(review): box_plot is not assigned in any cell shown in this notebook -
# confirm which axes object this is meant to annotate. medians[xtick] also
# looks up a (CohortType, variable)-indexed Series with a tick position - verify.
for xtick in box_plot.get_xticks():
    box_plot.text(xtick,medians[xtick] + vertical_offset,medians[xtick], 
            horizontalalignment='center',size='x-small',color='w',weight='semibold')
    

In [None]:
# 4) TODO: Generate a visualization also where you overlay the three density plots for each cohort over each other
# If you can add vertical lines in the distribution plot to show the mean.

# One facet per 'variable', four facets per row; within a facet the cohorts are
# distinguished by colour (hue='CohortType'). sharex=False gives each facet its own x axis.
g = sns.FacetGrid(plot_df, col='variable', hue='CohortType', col_wrap = 4, sharex = False,palette=sns.color_palette(['black', 'gray','green']))
# Draw a kernel density estimate of 'value' per cohort in every facet, then add the legend.
p1 = g.map(sns.kdeplot, 'value').add_legend()


# Example of drawing a mean line on a density plot:
#https://stackoverflow.com/questions/41144357/showing-the-mean-line-in-a-density-plot-in-pandas

In [None]:
# 5) TODO: Correlate the 15 harmonized urine values but write your own correlation functions and visualizations.
##heatmap of correlation in each cohort
import seaborn as sns

# NOTE(review): intersec_3_cohs_list is not assigned in any cell shown in this
# notebook - confirm where it comes from. Its first two entries are skipped.
features=intersec_3_cohs_list[2:]

fig, ax =plt.subplots(1,3)
fig.set_size_inches(17, 5)

# One correlation heatmap per cohort, left to right: NEU, DAR, UNM.
# A single loop replaces three copy-pasted stanzas and, because the cell no
# longer ends on a bare set_title(...) expression, no stray Text repr is shown.
for panel, (cohort_df, cohort_name) in zip(ax, [(neu_df, 'NEU'), (dar_df, 'DAR'), (unm_df, 'UNM')]):
    sns.heatmap(cohort_df[features].corr(), ax=panel)
    panel.set_title(cohort_name)
    ##optional tick styling:
    ##panel.set_xticklabels(labels=features,rotation=45)
    ##panel.set_yticklabels(labels=features,rotation=0)




### BLOD Graphs - Plotting number of detects (0) per Analyte

In [None]:
##creating BLOD df: one row per cohort, one column per analyte (added below)
BLOD=pd.DataFrame(columns=["CohortType"], data=["NEU", "DAR", "UNM"])

##intersected columns with just bio-sample columns.
##Built as a new list so the shared intersec_3_cohs_list is not mutated
##(removing items through an alias would change what other cells see on re-run).
# NOTE(review): intersec_3_cohs_list is not assigned in any cell shown in this
# notebook - confirm where it comes from.
copy_intersec=[col for col in intersec_3_cohs_list
               if col not in ("PIN_Patient", "CohortType")]

##filling df with number of 0's per each analyte per cohort.
##Row order must match the CohortType column above: NEU, DAR, UNM.
##(The UNM entry previously read dar_df_blod; it now reads unm_df_blod.)
for col in copy_intersec:
    BLOD[col]=[(neu_df_blod[col]==0).sum(),
               (dar_df_blod[col]==0).sum(),
               (unm_df_blod[col]==0).sum()]
BLOD_melted=pd.melt(BLOD, id_vars = ['CohortType'])

##creating barchart from the counts
g = sns.catplot(
    data=BLOD_melted,
    x='CohortType', y='value',
    col='variable', kind='bar', col_wrap=5, sharey = False,
    palette=sns.color_palette(['black', 'green','gray']))

##adding label which is the number of 0's per analyte
for ax in g.axes.ravel():

    # add annotations: one label per bar, showing its height with no decimals
    for c in ax.containers:
        labels = [f'{(v.get_height()):.0f}' for v in c]
        ax.bar_label(c, labels=labels, label_type='edge')
    ax.margins(y=0.2)



In [None]:

# 6) TODO: Can you write about what you see? Are there any cohorts that are similar in the harmonized data?
# Are there any analytes that are particularly higher or lower in specific cohorts?


### 7) TODO: Think about how we could compare similarity between two cohorts

- Look at the shape of the distributions
- Look at peak of distributions (mode)
- Look at median and compare to dashed line (mean)
