### Setup

This notebook explains how to access the dataset models from Django.
The datasets are loaded from the Postgres database into pandas dataframes.

To start the notebook server:

```
# Start a bash shell inside the api container
docker-compose exec api /bin/bash

# Start the jupyter notebook
python manage.py shell_plus --notebook

# Take note of the token printed by the server and open the notebook at:
<ip-of-the-machine>:7777/?token=<token>
# For example, from the machine itself: http://127.0.0.1:7777/?token=<token>
```

This step only has to be done once, as long as the Jupyter notebook server keeps running.
While the server is running, you can access the notebook at:
``` 
<ip-of-the-machine>:7777/?<token>
```

In [1]:
# Required to access the database from the notebook: the Jupyter kernel runs
# inside an event loop, and Django blocks synchronous ORM calls in that context
# unless DJANGO_ALLOW_ASYNC_UNSAFE is set. It must be set before any query runs.
import os
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

# Print numpy arrays in full instead of summarising long ones with "...".
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

# Data analysis tools
import pandas as pd
import numpy as np
import seaborn as sns

# Models available in our application
from datasets.models import RawFlower, RawUNM, RawDAR
from django.contrib.auth.models import User

In [2]:
from api import adapters
from api import analysis

In [3]:
from api import adapters
from api import analysis
import pandas as pd
import numpy as np

## Load the three cohort frames (NEU, DAR, UNM)

## NEU data with no fish: keep visit 2 only, then attach the covariates
df_NEU = adapters.neu.get_dataframe_orig()
df_NEU = df_NEU.loc[df_NEU['TimePeriod'] == 2]

df_NEU_covars = adapters.neu.get_dataframe_covars()
df_NEU = (
    df_NEU_covars
    .merge(df_NEU, on=['PIN_Patient', 'CohortType', 'TimePeriod'])
    # -9 (as a number or as a string) is replaced by NaN
    .replace(-9, np.nan)
    .replace('-9', np.nan)
)

## DAR data
df_DAR = adapters.dar.get_dataframe()

## UNM data with no fish, with the covariates attached
df_UNM = adapters.unm.get_dataframe_orig()
df_UNM_covars = adapters.unm.get_dataframe_covars()
df_UNM = df_UNM_covars.merge(
    df_UNM,
    on=['PIN_Patient', 'CohortType', 'TimePeriod'],
)

## Stand-ins for machines without the live data (keep commented on the live machine):
#df_DAR = df_NEU.copy()
#df_DAR['CohortType'] = 'DAR'
#df_UNM = df_DAR.copy()
#df_UNM['CohortType'] = 'UNM'

# NOTE(review): only the NEU frame has -9 replaced by NaN here; confirm whether
# the UNM and DAR frames need the same treatment, e.g.
#df_UNM = df_UNM.replace(-9, np.nan).replace('-9', np.nan)
#df_DAR = df_DAR.replace(-9, np.nan).replace('-9', np.nan)

In [4]:
import os
# Show the kernel's working directory; the relative paths passed to to_csv
# elsewhere in this notebook are resolved against it.
os.getcwd()



'/usr/src/app/jupyter'

In [5]:
# (rows, columns) of the NEU frame after the visit-2 filter and covariate merge.
df_NEU.shape

(570, 50)

Male/female infants - babySex
Birth weight (g) - birthWt
Gestational age (weeks) - Outcome_weeks
Head circumference (cm) - headCirc
SGA - SGA
Preterm - Outcome
Birth length - birthLen
Maternal age - age
Smoke during pregnancy - smoking
Specific gravity - SPECIFICGRAVITY_V2, urine_specific_gravity
Creatinine - creatininemgdl
Maternal BMI - BMI
Parity - parity 
Maternal level of education - education
Total urinary arsenic (µg/L) - UTAS
Summation iAs + MMA + DMA (µg/L) - PropMMAtoiAs + UMMA + UDMA
Inorganic arsenic (µg/L) - PropMMAtoiAs
Monomethylarsonic acid (µg/L) - UMMA
Dimethylarsinic acid (µg/L) - UDMA

In [36]:
# Columns that every cohort frame must expose, in report order.
req = [
    'CohortType', 'PIN_Patient',
    'babySex', 'birthWt', 'Outcome_weeks', 'headCirc',
    'SGA', 'LGA', 'Outcome', 'birthLen',
    'age', 'smoking',
    'SPECIFICGRAVITY_V2', 'urine_specific_gravity', 'creatininemgdl',
    'BMI', 'parity', 'education',
    'UTAS', 'UAS3', 'UAS5', 'UMMA', 'UDMA',
]

# A cohort that lacks one of the required columns gets it added, filled with NaN,
# so that the same column selection works on all three frames.
for frame in (df_NEU, df_DAR, df_UNM):
    for col in req:
        if col not in frame:
            frame[col] = np.nan

# Same columns, same order, for every cohort.
df_neu, df_dar, df_unm = (frame[req] for frame in (df_NEU, df_DAR, df_UNM))

In [37]:
# Covariate column names.
covars = [
    'Outcome_weeks', 'age', 'ethnicity', 'race',
    'BMI', 'smoking', 'parity', 'preg_complications',
    'folic_acid_supp', 'fish',
    'babySex', 'birthWt', 'birthLen', 'headCirc',
    'WeightCentile', 'LGA', 'SGA',
    'ga_collection', 'education', 'birth_year',
    'SPECIFICGRAVITY_V2', 'fish_pu_v2',
]

In [38]:
# Stack the three cohorts (NEU, then UNM, then DAR) into one long frame.
# Each cohort keeps its own row labels, so index values repeat across cohorts.
dff = pd.concat((df_neu, df_unm, df_dar), axis=0)

dff

Unnamed: 0,CohortType,PIN_Patient,babySex,birthWt,Outcome_weeks,headCirc,SGA,LGA,Outcome,birthLen,...,urine_specific_gravity,creatininemgdl,BMI,parity,education,UTAS,UAS3,UAS5,UMMA,UDMA
0,NEU,2627,1.0,3100.0,39.857143,53.34,0.0,0.0,0.0,50.800,...,,,27.240000,1.0,4.0,6.924000,,,,
1,NEU,2628,1.0,3200.0,40.714286,34.29,0.0,0.0,0.0,50.800,...,,,28.000000,1.0,2.0,4.087000,,,,
2,NEU,2632,2.0,3100.0,36.428571,33.02,0.0,0.0,1.0,52.070,...,,,30.490000,2.0,5.0,12.572000,,,,
3,NEU,2634,2.0,2100.0,37.428571,31.75,1.0,0.0,0.0,46.990,...,,,30.960000,2.0,4.0,2.222000,,,,
4,NEU,2635,2.0,3600.0,39.714286,33.02,0.0,0.0,0.0,48.895,...,,,30.770000,3.0,3.0,17.710000,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2147,DAR,a5b96520-257a-4573-989f-ee88daa2bda8,,,,,,,,,...,1.010,,20.866977,,5.0,5.595289,,,0.344520,5.059960
2148,DAR,5ef7649b-765f-4c3f-b732-0330d30aac42,,,37.000000,,,,0.0,,...,1.014,,32.024857,,1.0,3.086621,,,0.211611,2.730020
2149,DAR,b18994b4-a76a-4c6a-b15d-430d54deb2e8,,,,,,,,,...,1.022,,21.753218,,5.0,6.785385,,,0.354555,6.055593
2150,DAR,9a420c8f-cb96-413a-9bae-f4c1d4c5f11e,,,,,,,,,...,1.012,,22.530119,,4.0,8.085933,,,0.566572,7.080416


In [39]:
import pandas as pd

# Non-null value counts: one row per variable, one column per cohort,
# plus a 'Total' column summed across the cohorts.
dff2 = (
    dff.groupby(['CohortType'])
    .count()
    .transpose()
    # Sum over whichever cohort columns exist instead of naming NEU/DAR/UNM,
    # so a missing or additional cohort does not break the cell.
    .assign(Total=lambda per_cohort: per_cohort.sum(axis=1))
    # Turn the variable names into a regular 'index' column.
    .reset_index()
)



In [40]:
# Display the per-variable, per-cohort counts table.
dff2

CohortType,index,DAR,NEU,UNM,Total
0,PIN_Patient,2152,570,135,2857
1,babySex,2133,554,135,2822
2,birthWt,1930,555,135,2620
3,Outcome_weeks,2144,570,135,2849
4,headCirc,2019,512,135,2666
5,SGA,1910,554,135,2599
6,LGA,1910,554,135,2599
7,Outcome,2144,570,135,2849
8,birthLen,2022,537,135,2694
9,age,2152,569,135,2856


In [None]:
# Write the counts table to the working directory, without the row index.
dff2.to_csv(path_or_buf='countspervariable.csv', index=False)

In [47]:
## Continuous variables (each listed once; 'BMI' was previously listed twice)
contin = [
    'birthWt', 'Outcome_weeks', 'headCirc', 'birthLen', 'age', 'BMI',
    'parity', 'SPECIFICGRAVITY_V2', 'urine_specific_gravity', 'creatininemgdl',
    'UTAS', 'UAS3', 'UAS5', 'UMMA', 'UDMA',
]
## Categorical variables
categ = ['Outcome', 'SGA', 'LGA', 'smoking', 'education']
#dff = dff.reset_index()

def q1(x):
    """Return the first quartile (25th percentile) of ``x``."""
    lower_quartile = x.quantile(q=0.25)
    return lower_quartile

def q2(x):
    """Return the median of ``x``."""
    middle = x.median()
    return middle

def q3(x):
    """Return the third quartile (75th percentile) of ``x``."""
    upper_quartile = x.quantile(q=0.75)
    return upper_quartile


# Per-cohort summary statistics, transposed so that each row is a
# (variable, statistic) pair and each column is a cohort.
# PIN_Patient is a participant identifier, not a measurement (it holds UUID
# strings for DAR), so it is dropped before aggregating: older pandas silently
# discarded such columns, whereas pandas >= 2.0 raises a TypeError when 'mean'
# is applied to them.
contin_res = (
    dff.drop(columns=['PIN_Patient'], errors='ignore')
    .groupby(['CohortType'])
    .agg(['count', 'mean', 'min', 'max', q1, q2, q3])
    .transpose()
)

contin_res

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Unnamed: 0,CohortType,DAR,NEU,UNM
babySex,count,2133.000000,554.000000,135.000000
babySex,mean,1.501172,1.545126,1.466667
babySex,min,1.000000,1.000000,1.000000
babySex,max,2.000000,2.000000,2.000000
babySex,q1,1.000000,1.000000,1.000000
...,...,...,...,...
UDMA,min,0.001517,,
UDMA,max,241.070000,,
UDMA,q1,1.320024,,
UDMA,q2,2.543094,,


In [None]:
# Export the summary statistics. The frame is named contin_res (contin_rez was
# an undefined name). The index is written out because it holds the
# (variable, statistic) labels of every row.
contin_res.to_csv('contious_statistics.csv')

In [73]:
# Long format: one row per (cohort, categorical variable, observed value).
melted = dff.loc[:, [*categ, 'CohortType']].melt(id_vars='CohortType')

# Constant helper column; counting it per group gives the number of rows
# for each cohort / variable / value combination.
melted = melted.assign(dummy=1)
group_keys = ['CohortType', 'variable', 'value']
df33 = melted.groupby(group_keys)['dummy'].count()

df33

CohortType  variable   value
DAR         LGA        0.0      1444
                       1.0       466
            Outcome    0.0      1962
                       1.0       182
            SGA        0.0      1842
                       1.0        68
            education  1.0        18
                       2.0       199
                       3.0       314
                       4.0       695
                       5.0       567
            smoking    0.0      1621
                       1.0       100
                       2.0         7
                       3.0       126
NEU         LGA        0.0       500
                       1.0        54
            Outcome    0.0       526
                       1.0        44
            SGA        0.0       505
                       1.0        49
            education  1.0        34
                       2.0       146
                       3.0       119
                       4.0       186
                       5.0        81
         

In [12]:
# Write the counts table to the media files folder, without the row index.
dff2.to_csv(path_or_buf='../mediafiles/toni_report.csv', index=False)

In [None]:
# Summation check for UNM: does UNM have inorganic arsenic?
# Shape of the UNM rows where inorganic arsenic, MMA and DMA are all present.
# The full df_UNM frame is used because df_unm only holds the `req` columns,
# and 'PropMMAtoiAs' is not one of them (df_unm['PropMMAtoiAs'] raises KeyError).
# reindex turns a column that is absent from df_UNM into all-NaN, so a missing
# column yields zero matching rows instead of an error.
df_UNM[
    df_UNM.reindex(columns=['PropMMAtoiAs', 'UMMA', 'UDMA']).notna().all(axis=1)
].shape