In [1]:
# Required to access the database.
# NOTE(review): per the Django async-safety docs, this flag lets synchronous ORM
# calls run even though an event loop is active (as is typical in a Jupyter
# kernel); it must be set before any ORM call is made.
import os
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

import sys
import numpy
# Raise the summarization threshold to the largest int so numpy prints arrays
# in full rather than abbreviating them.
numpy.set_printoptions(threshold=sys.maxsize)

# Data analysis tools
import pandas as pd
import numpy as np
import seaborn as sns

# Models available in our application
from datasets.models import RawFlower, RawUNM, RawDAR, RawNHANES_BIO
from django.contrib.auth.models import User
from datasets.models import RawDictionary


from datasets.models import RawNEU
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels

from api import adapters



In [2]:
def merge3CohortFrames2(df1,df2,df3):
    """Stack three cohort frames row-wise, keeping only the columns they share.

    Before intersecting, each of the columns 'UASB', 'UDMA', 'UAS5', 'UIAS',
    'UAS3' and 'UMMA' is added (filled with NaN) to any frame that lacks it,
    so these six columns are always present in the result.

    NOTE: the missing columns are added to ``df1``, ``df2`` and ``df3``
    in place, i.e. the caller's frames are modified. This side effect is
    kept from the original implementation because later cells may rely on it.

    Parameters
    ----------
    df1, df2, df3 : pandas.DataFrame
        Cohort frames to combine.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df1``, ``df2`` and ``df3`` concatenated in that order
        (original row indices are preserved), restricted to the shared
        columns. Columns follow the order in which they appear in ``df1``.
    """
    as_features = ['UASB', 'UDMA', 'UAS5', 'UIAS', 'UAS3', 'UMMA']

    for frame in (df1, df2, df3):
        for as_feature in as_features:
            if as_feature not in frame.columns:
                frame[as_feature] = np.nan

    shared = set(df1.columns).intersection(df2.columns, df3.columns)

    # Select with a list, not a set: pandas deprecated set indexers in 1.5 and
    # raises TypeError for them from 2.0 on. Walking df1's columns also makes
    # the output column order deterministic (set iteration order is arbitrary).
    cc = [col for col in dict.fromkeys(df1.columns) if col in shared]

    df_all = pd.concat([df1[cc], df2[cc], df3[cc]])

    return df_all

In [3]:
## Load the cohort data

## NEU data with no fish, restricted to the second visit
df_NEU = adapters.neu.get_dataframe_orig()
df_NEU = df_NEU.loc[df_NEU['TimePeriod'] == 2]  # Visit 2

df_NEU_blod = adapters.neu.get_dataframe_BLOD()

## Attach the NEU covariates on the shared key columns
df_NEU_covars = adapters.neu.get_dataframe_covars()
merge_keys = ['PIN_Patient', 'CohortType', 'TimePeriod']
df_NEU = df_NEU_covars.merge(df_NEU, on=merge_keys)

## DAR data: the real loader is disabled, so a relabelled copy of NEU stands in
## df_DAR = adapters.unm.get_dataframe_orig()

df_DAR = df_NEU.assign(CohortType='DAR')
df_DAR_blod = df_NEU_blod.copy()

## UNM data: real loader kept below as an inert string; NEU copy used instead
'''
df_UNM = adapters.unm.get_dataframe_orig()
df_UNM_covars = adapters.unm.get_dataframe_covars()
df_UNM = df_UNM_covars.merge(df_UNM, on = ['PIN_Patient','CohortType','TimePeriod']) #Merge the covariates
'''

df_UNM = df_NEU.assign(CohortType='UNM')
df_UNM_blod = df_NEU_blod.copy()

## NHANES data (loaded here, not yet part of the merged frame)

df_NHANES = adapters.nhanes.get_dataframe_orig()
df_NHANES_blod = adapters.nhanes.get_dataframe_orig_blod()
df_NHANES_covars = adapters.nhanes.get_dataframe_covars()

## JULIA: You will need another function that merges 4 data frames
df_ALL = merge3CohortFrames2(df_NEU, df_UNM, df_DAR)


## df_ALL = analysis.merge3CohortFrames(df_NEU, df_UNM, df_DAR)
frames_for_analysis = [
    ('NEU', df_NEU),
    ('UNM', df_UNM),
    ('DAR', df_DAR),
    ('ALL', df_ALL),
]

## Report the shape of every frame that goes into the analysis
for name, df in frames_for_analysis:
    print('Data Stats', name, df.shape, sep='\n')

Data Stats
NEU
(570, 55)
Data Stats
UNM
(570, 55)
Data Stats
DAR
(570, 55)
Data Stats
ALL
(1710, 55)


In [9]:
## Only 247 pregnant women in all of NHANES?
## (.info() below reports the entry count and per-column non-null counts.)

df_NHANES_covars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 247 entries, 198 to 57301
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PIN_Patient  247 non-null    int64  
 1   Age          247 non-null    int64  
 2   TimePeriod   247 non-null    object 
 3   Pregnant     247 non-null    float64
 4   Marital      247 non-null    float64
 5   Child_A      247 non-null    int64  
 6   Child_B      247 non-null    int64  
 7   H_Inc        237 non-null    float64
 8   F_Inc        237 non-null    float64
 9   Edu          247 non-null    float64
 10  Rac          247 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 23.2+ KB


In [5]:
## Comments:
## While there are 243 total participants, the non-null count varies per analyte.
## UALB_mg is not a harmonized variable: #df['UALB'] = df['UALB_mg'] * conversion
## UCRT_mg is not a harmonized variable
## UCRT_umol is not a harmonized variable




df_NHANES_blod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 22 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PIN_Patient  243 non-null    int64  
 1   TimePeriod   243 non-null    object 
 2   UALB_mg      186 non-null    float64
 3   UALB_ug      186 non-null    float64
 4   UBA          80 non-null     float64
 5   UCD          80 non-null     float64
 6   UCO          80 non-null     float64
 7   UCR          20 non-null     float64
 8   UCRT_mg      121 non-null    float64
 9   UCRT_umol    121 non-null    float64
 10  UCS          80 non-null     float64
 11  UHG          80 non-null     float64
 12  UI           42 non-null     float64
 13  UMN          80 non-null     float64
 14  UMO          80 non-null     float64
 15  UNI          20 non-null     float64
 16  UPB          80 non-null     float64
 17  USB          80 non-null     float64
 18  USN          80 non-null     float64
 19  UTL     

In [6]:
# NOTE(review): repeats the df_NHANES_blod.info() call from the cell above —
# likely redundant; confirm before removing.
df_NHANES_blod.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 22 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PIN_Patient  243 non-null    int64  
 1   TimePeriod   243 non-null    object 
 2   UALB_mg      186 non-null    float64
 3   UALB_ug      186 non-null    float64
 4   UBA          80 non-null     float64
 5   UCD          80 non-null     float64
 6   UCO          80 non-null     float64
 7   UCR          20 non-null     float64
 8   UCRT_mg      121 non-null    float64
 9   UCRT_umol    121 non-null    float64
 10  UCS          80 non-null     float64
 11  UHG          80 non-null     float64
 12  UI           42 non-null     float64
 13  UMN          80 non-null     float64
 14  UMO          80 non-null     float64
 15  UNI          20 non-null     float64
 16  UPB          80 non-null     float64
 17  USB          80 non-null     float64
 18  USN          80 non-null     float64
 19  UTL     