### Setup

This notebook explains how to to access the dataset models from django.
The datasets are loaded from the Postgres database into pandas dataframes.

To start the notebook server:

```
# Start a bash shell inside the api container
docker-compose exec api /bin/bash

# Start the jupyter notebook
python manage.py shell_plus --notebook

# Take note of the token provided and access the notebook through:
<ip-of-the-machine>:7777/?<token>
http://127.0.0.1:7777/?token=30c43675981e671b4a609cff470819098e274bbde415b7f5
```

This step has only to be made once as long as the jupyter-notebook keeps executing.
Considering that the the notebook continues to run, you can access the notebook using:
``` 
<ip-of-the-machine>:7777/?<token>
```

In [1]:
# Required to access the database//
import os
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

# Data analysis tools
import pandas as pd
import numpy as np
import seaborn as sns

# Models available in our application
from datasets.models import RawFlower, RawUNM, RawDAR
from django.contrib.auth.models import User

In [2]:
from api import adapters
from api import analysis

In [3]:
import pandas as pd
import numpy as np

## Get the data

## Get NEU data with no fish
df_NEU = adapters.neu.get_dataframe_orig()
df_NEU = df_NEU[df_NEU['TimePeriod']==2] # Visit 2

df_NEU_covars = adapters.neu.get_dataframe_covars()
df_NEU = df_NEU_covars.merge(df_NEU, on = ['PIN_Patient','CohortType','TimePeriod']) #Merge the covariates
#df_NEU = df_NEU.replace(-9,np.nan).replace('-9', np.nan)
## Get DAR data
df_DAR = adapters.dar.get_dataframe()

#df_DAR = df_NEU.copy()
#df_DAR['CohortType'] = 'DAR'

## Get UNM data with no fis

df_UNM = adapters.unm.get_dataframe_orig()
df_UNM_covars = adapters.unm.get_dataframe_covars()
df_UNM = df_UNM_covars.merge(df_UNM, on = ['PIN_Patient','CohortType','TimePeriod']) #Merge the covariates


## comment these two lines if on live machine
#df_UNM = df_DAR.copy()
#df_UNM['CohortType'] = 'UNM'

#df_UNM = df_NEU.copy()
#df_neu = df_neu.replace(-9, np.nan).replace('-9', np.nan)
##df_unm = df_unm.replace(-9, np.nan).replace('-9', np.nan)
#df_dar = df_dar.replace(-9, np.nan).replace('-9', np.nan)

In [4]:
df_UNM['race']

0      3.0
1      3.0
2      NaN
3      NaN
4      3.0
      ... 
130    3.0
131    3.0
132    3.0
133    3.0
134    3.0
Name: race, Length: 135, dtype: float64

In [5]:
import os
os.getcwd()



'/usr/src/app/jupyter'

In [6]:
df_NEU.shape

(570, 49)

Male/female infants - babySex
Birth weight (g) - birthWt
Gestational age (weeks) - Outcome_weeks
Head circumference (cm) = headCirc
SGA - SGA
Preterm - Outcome
Birth length - birthLen
Maternal age - age
Smoke during pregnancy - smoking
Specific gravity SPECIFICGRAVITY_V2, urine_specigic_gravity
Creatinine - creatininemgl
Maternal BMI - BMI
Parity - parity 
Maternal level of education - educatuib
Total urinary arsenic (µg/L) - UTAS
Summation iAs + MMA + DMA (µg/L) - PropMMAtoiAs + UMMA + DMA
Inorganic arsenic (µg/L) - PropMMAtoiAs
Monomethylarsonic acid (µg/L) -UMMA
Dimethylarsinic acid (µg/L) - UDMA

In [7]:
req = [ 'CohortType', 'PIN_Patient',
'babySex',
'birthWt',
'Outcome_weeks',
'headCirc',
'SGA',
'LGA',
'Outcome',
'birthLen',
'age',
'smoking',
'race',
'SPECIFICGRAVITY_V2',
'urine_specific_gravity',
'creatininemgdl',
'BMI',
'parity',
'education',
'UTAS',
'UAS3',
'UAS5',
'UMMA',
'UDMA']


for col in req:
    if col not in df_NEU:
        df_NEU[col] = np.nan
        
    if col not in df_DAR:
        df_DAR[col] = np.nan
    if col not in df_UNM:
        df_UNM[col] = np.nan
        

df_neu = df_NEU[req]
df_dar = df_DAR[req]
df_unm = df_UNM[req]

In [8]:
covars = ['Outcome_weeks', 
        'age',
        'ethnicity', 
        'race', 
        'BMI', 
        'smoking', 
        'parity', 
        'preg_complications',
        'folic_acid_supp',
        'fish', 
        'babySex', 
        'birthWt', 
        'birthLen', 
        'headCirc',
        'WeightCentile',
        'LGA',
        'SGA',
        'ga_collection','education', 'birth_year', 
        'SPECIFICGRAVITY_V2', 'fish_pu_v2']

In [9]:
dff = pd.concat([df_neu,df_unm,df_dar])


In [10]:
dff.columns

Index(['CohortType', 'PIN_Patient', 'babySex', 'birthWt', 'Outcome_weeks',
       'headCirc', 'SGA', 'LGA', 'Outcome', 'birthLen', 'age', 'smoking',
       'race', 'SPECIFICGRAVITY_V2', 'urine_specific_gravity',
       'creatininemgdl', 'BMI', 'parity', 'education', 'UTAS', 'UAS3', 'UAS5',
       'UMMA', 'UDMA'],
      dtype='object')

In [11]:
dff33 = dff#dff.replace(np.nan, -9)

In [12]:
import pandas as pd

dff2 = dff33.groupby(['CohortType']).count().transpose().reset_index()
dff2['Total'] = dff2['NEU'] + dff2['DAR'] + dff2['UNM']



In [32]:
#dff2
dff2

CohortType,index,DAR,NEU,UNM,Total
0,PIN_Patient,2152,570,135,2857
1,babySex,2133,570,135,2838
2,birthWt,1930,555,135,2620
3,Outcome_weeks,2144,570,135,2849
4,headCirc,2019,512,135,2666
5,SGA,1910,554,135,2599
6,LGA,1910,554,135,2599
7,Outcome,2144,570,135,2849
8,birthLen,2022,537,135,2694
9,age,2152,570,135,2857


In [15]:
dff2.to_csv('countspervariable.csv', index = False)

In [16]:
dff = dff.reset_index(drop=True)

In [33]:
from scipy.stats import gmean, gstd


##continous
contin = ['birthWt','Outcome_weeks','birthLen','age','CohortType',
           'BMI','UTAS','UAS3','UAS5','UMMA','UDMA']
##categorical
categ = ['Outcome','SGA','LGA','smoking','education','babySex','parity','race']
#dff = dff.reset_index()

def q1(x):
    return x.quantile(0.25)

def q2(x):
    return x.median()

def q3(x):
    return x.quantile(0.75)

def gmeans(x):
    m = x[x.notna()]
    return gmean(m)

def gstdss(x):
    m = x[x.notna()]
    
    try:
        return gstd(m)
    except:
        print('Chcek', m)
        return 0

contin_res = dff[contin].replace(-9, np.nan).groupby(['CohortType']).agg([gmeans,gstdss,np.nanmean,'min','max']).transpose()

#Check the this prints, then there are erros in the geometric std


Chcek 570    50.80
571    50.80
572    51.00
573    51.00
574    51.00
       ...  
700    50.80
701    48.50
702    48.50
703    50.00
704    52.75
Name: birthLen, Length: 135, dtype: float64
Chcek 0      18.943000
1      10.054318
2       8.167619
3      24.770392
4      15.013766
         ...    
565    14.562668
566     8.806000
567     6.319761
568     8.062860
569    28.981445
Name: UTAS, Length: 570, dtype: float64
Chcek Series([], Name: UAS3, dtype: float64)
Chcek Series([], Name: UAS3, dtype: float64)
Chcek Series([], Name: UAS5, dtype: float64)
Chcek Series([], Name: UAS5, dtype: float64)
Chcek Series([], Name: UMMA, dtype: float64)
Chcek Series([], Name: UMMA, dtype: float64)
Chcek Series([], Name: UDMA, dtype: float64)
Chcek Series([], Name: UDMA, dtype: float64)


  log_a = np.log(np.array(a, dtype=dtype))
  log_a = np.log(np.array(a, dtype=dtype))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


In [19]:
three_summary = pd.pivot_table(contin_res.reset_index(), values=['DAR', 'NEU', 'UNM'], index = 'level_0',
                    columns=['level_1']).reset_index()

In [24]:
three_summary.round(4)

CohortType,level_0,DAR,DAR,DAR,DAR,DAR,NEU,NEU,NEU,NEU,NEU,UNM,UNM,UNM,UNM,UNM
level_1,Unnamed: 1_level_1,gmeans,gstdss,max,min,nanmean,gmeans,gstdss,max,min,nanmean,gmeans,gstdss,max,min,nanmean
0,BMI,25.6185,1.2399,63.2328,13.2249,26.2539,25.9798,1.2176,52.86,12.16,26.4933,43.9871,1.5657,82.9874,15.1349,48.1899
1,Outcome_weeks,38.9131,1.0498,44.71,24.14,38.9571,38.8921,1.0534,42.7143,23.2857,38.9414,37.9554,1.0592,41.9683,34.1629,38.0175
2,UAS3,0.157,4.1523,16.4,0.0071,0.4108,,0.0,,,,,0.0,,,
3,UAS5,0.0985,3.7892,37.241,0.001,0.3808,,0.0,,,,,0.0,,,
4,UDMA,2.4071,2.7534,241.07,0.0015,3.994,,0.0,,,,,0.0,,,
5,UMMA,0.2319,3.3384,28.35,0.0022,0.4206,,0.0,,,,,0.0,,,
6,UTAS,5.4736,3.5877,870.837,0.0071,14.3691,,0.0,466.0342,-488.9316,14.2591,5.4952,2.0963,35.4462,0.933,7.0717
7,age,30.4755,1.2041,2018.0,18.0,31.7319,26.5035,1.2299,40.0,18.0,27.0702,28.7499,1.3158,42.8996,17.0867,29.8177
8,birthLen,50.5536,1.0635,62.23,19.5,50.6444,50.4635,1.0646,58.42,30.1498,50.5575,0.0,0.0,56.0,0.0,48.2251
9,birthWt,3363.4579,1.2009,5400.0,357.2037,3413.5054,3138.7757,1.1988,4700.0,900.0,3185.4054,3.2236,1.1997,4.55,1.4197,3.2733


In [34]:
three_summary.to_csv('../mediafiles/table_continous.csv')

In [35]:
melted = pd.melt(dff[categ + ['CohortType']],id_vars=['CohortType'])    

melted['count'] = 1

melted = melted.replace(np.nan, -9)

df33 = melted.groupby(['CohortType','variable','value'])['count'].count()
df44 = df33.reset_index()


In [36]:
df44.to_csv('../mediafiles/table_categorical.csv')