### Setup

This notebook explains how to access the dataset models from Django.
The datasets are loaded from the Postgres database into pandas dataframes.

To start the notebook server:

```
# Start a bash shell inside the api container
docker-compose exec api /bin/bash

# Start the jupyter notebook
python manage.py shell_plus --notebook

# Take note of the token printed by the server and open the notebook at:
http://<ip-of-the-machine>:7777/?token=<token>
# Example:
http://127.0.0.1:7777/?token=30c43675981e671b4a609cff470819098e274bbde415b7f5
```

This step only has to be done once, as long as the Jupyter notebook server keeps running.
While the server is running, you can access the notebook at:
```
http://<ip-of-the-machine>:7777/?token=<token>
```

In [1]:
# Required to access the database: Django refuses synchronous ORM calls when it
# detects a running event loop (as in a Jupyter kernel); this flag disables that
# safety check. It is set before any model is imported or queried.
import os
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

import sys
import numpy
# Print arrays in full instead of summarising long ones with "...".
numpy.set_printoptions(threshold=sys.maxsize)

# Data analysis tools
import pandas as pd
import numpy as np
import seaborn as sns

# Models available in our application
from datasets.models import RawFlower, RawUNM, RawDAR
from django.contrib.auth.models import User

In [2]:
from api import adapters
from api import analysis



In [3]:

# NEU analyte choices: a single ('Analytes', ((code, label), ...)) group.
# The pairs are written as a code -> label mapping and converted back to a
# tuple of 2-tuples; dict insertion order keeps the original ordering.
CAT_NEU_ANALYTES = [('Analytes', tuple({
    'USB': 'Antimony - Urine',
    # TODO: label modified just for the poster - change back later / check
    # whether this is actually total arsenic.
    'UTAS': 'Arsenic Total - Urine',
    'UBA': 'Barium - Urine',
    'UBE': 'Beryllium - Urine',
    'UCD': 'Cadmium - Urine',
    'UCS': 'Cesium - Urine',
    'UCR': 'Chromium - Urine',
    'UCO': 'Cobalt - Urine',
    'UCU': 'Copper - Urine',
    'UPB': 'Lead - Urine',
    'UMN': 'Manganese - Urine',
    'UHG': 'Mercury - Urine',
    'UMO': 'Molybdenum - Urine',
    'UNI': 'Nickel - Urine',
    'UPT': 'Platinum - Urine',
    'USE': 'Selenium - Urine',
    'UTL': 'Thallium - Urine',
    'USN': 'Tin - Urine',
    'UTU': 'Tungsten - Urine',
    'UUR': 'Uranium - Urine',
    'UVA': 'Vanadium - Urine',
    'UZN': 'Zinc - Urine',
    # Blood analytes (currently disabled):
    # 'BSB': 'Antimony - Blood',
    # 'BTAS': 'Arsenic - Blood',
    # 'BAL': 'Aluminum - Blood',
    # 'BBE': 'Beryllium - Blood',
    # 'BBA': 'Barium - Blood',
    # 'BCD': 'Cadmium - Blood',
    # 'BCS': 'Cesium - Blood',
    # 'BCO': 'Cobalt - Blood',
    # 'BCU': 'Copper - Blood',
    # 'BCR': 'Chromium - Blood',
    # 'BFE': 'Iron - Blood',
    # 'BPB': 'Lead - Blood',
    # 'BPB208': 'Lead (208) - Blood',
    # 'BMB': 'Manganese - Blood',
    # 'BHG': 'Mercury - Blood',
    # 'BMO': 'Molybdenum - Blood',
    # 'BNI': 'Nickel - Blood',
    # 'BPT': 'Platinum - Blood',
    # 'BTL': 'Thallium - Blood',
    # 'BTU': 'Tungsten - Blood',
    # 'BUR': 'Uranium - Blood',
    # 'BVA': 'Vanadium - Blood',
    # 'BSE': 'Selenium - Blood',
    # 'BSEG1124': 'Selenium+G1124 - Blood',
    # 'BSN': 'Tin - Blood',
    # 'BZN': 'Zinc - Blood',
}.items()))]


# DAR analyte choices: a single ('Analytes', ((code, label), ...)) group.
# The trailing comment on each entry is the original note on how the analyte
# maps to the DAR database column. Commented-out entries are analytes that are
# not enabled for DAR.
# Labels carry no leading/trailing whitespace and use the same spelling as the
# other cohort lists (e.g. 'Nickel - Urine' for UNI).
CAT_DAR_ANALYTES = [('Analytes', (
    # Analyte acronym and name,                     Mapping in the dar DB
    ('UAG',  'Silver - Urine'),                     # Ag in ug/L
    ('UAL',  'Aluminium - Urine'),                  # Al in ug/L
    ('UCR',  'Chromium - Urine'),                   # Cr in ug/L
    ('UCU',  'Copper - Urine'),                     # Cu in ug/L
    ('UFE',  'Iron - Urine'),                       # Fe in ug/L
    ('UNI',  'Nickel - Urine'),                     # Ni in ug/L
    ('UVA',  'Vanadium - Urine'),                   # V in ug/L
    ('UZN',  'Zinc - Urine'),                       # Zn in ug/L
    # ('BCD',  'Cadmium - Blood'),
    # ('BHGE', 'Ethyl Mercury - Blood'),
    # ('BHGM', 'Methyl Mercury - Blood'),
    # ('BMN',  'Manganese - Blood'),
    # ('BPB',  'Lead - Blood'),
    # ('BSE',  'Selenium - Blood'),
    # ('IHG',  'Inorganic Mercury - Blood'),
    # ('THG',  'Mercury Total - Blood'),
    # ('SCU',  'Copper - Serum'),
    # ('SSE',  'Selenium - Serum'),
    # ('SZN',  'Zinc - Serum'),
    ('UAS3', 'Arsenous (III) acid - Urine'),        # As in ug/L
    # ('UAS5', 'Arsenic (V) acid - Urine'),
    ('UASB', 'Arsenobetaine - Urine'),              # AsB in ug/L
    # ('UASC', 'Arsenocholine - Urine'),
    ('UBA',  'Barium - Urine'),                     # Ba in ug/L
    ('UBE',  'Beryllium - Urine'),                  # Be in ug/L
    ('UCD',  'Cadmium - Urine'),                    # Cd in ug/L
    ('UCO',  'Cobalt - Urine'),                     # Co in ug/L
    ('UCS',  'Cesium - Urine'),                     # Cs in ug/L
    ('UDMA', 'Dimethylarsinic Acid - Urine'),       # DMA in ug/L
    ('UHG',  'Mercury - Urine'),                    # Hg in ug/L
    # ('UIO',  'Iodine - Urine'),
    ('UMMA', 'Monomethylarsinic Acid - Urine'),     # MMA in ug/L
    ('UMN',  'Manganese - Urine'),                  # Mn in ug/L
    ('UMO',  'Molybdenum - Urine'),                 # Mo in ug/L
    ('UPB',  'Lead - Urine'),                       # PB in ug/L
    # ('UPT',  'Platinum - Urine'),
    ('USB',  'Antimony - Urine'),                   # Sb in ug/L
    ('USN',  'Tin - Urine'),                        # Sn in ug/L
    ('USR',  'Strontium - Urine'),                  # Sr in ug/L
    ('UTAS', 'Arsenic Total - Urine'),              # iAs in ug/L
    ('UTL',  'Thallium - Urine'),                   # Tl in ug/L
    # ('UTMO', 'Trimethylarsine - Urine')
    ('UTU',  'Tungsten - Urine'),                   # W in ug/L
    ('UUR',  'Uranium - Urine'),                    # U in ug/L

))]

# UNM analyte choices: a single ('Analytes', ((code, label), ...)) group
# covering blood, serum and urine analytes.
# Labels follow the "<Name> - <Matrix>" pattern with single spaces around the
# dash, matching the other cohort lists.
CAT_UNM_ANALYTES = [('Analytes', (
    ('BCD',  'Cadmium - Blood'),
    ('BHGE', 'Ethyl Mercury - Blood'),
    ('BHGM', 'Methyl Mercury - Blood'),
    ('BMN',  'Manganese - Blood'),
    ('BPB',  'Lead - Blood'),
    ('BSE',  'Selenium - Blood'),
    ('IHG',  'Inorganic Mercury - Blood'),
    ('THG',  'Mercury Total - Blood'),
    ('SCU',  'Copper - Serum'),
    ('SSE',  'Selenium - Serum'),
    ('SZN',  'Zinc - Serum'),
    ('UAS3', 'Arsenous (III) acid - Urine'),
    ('UAS5', 'Arsenic (V) acid - Urine'),
    ('UASB', 'Arsenobetaine - Urine'),
    ('UASC', 'Arsenocholine - Urine'),
    ('UBA',  'Barium - Urine'),
    ('UBE',  'Beryllium - Urine'),
    ('UCD',  'Cadmium - Urine'),
    ('UCO',  'Cobalt - Urine'),
    ('UCS',  'Cesium - Urine'),
    ('UDMA', 'Dimethylarsinic Acid - Urine'),
    ('UHG',  'Mercury - Urine'),
    ('UIO',  'Iodine - Urine'),
    ('UMMA', 'Monomethylarsinic Acid - Urine'),
    ('UMN',  'Manganese - Urine'),
    ('UMO',  'Molybdenum - Urine'),
    ('UPB',  'Lead - Urine'),
    ('UPT',  'Platinum - Urine'),
    ('USB',  'Antimony - Urine'),
    ('USN',  'Tin - Urine'),
    ('USR',  'Strontium - Urine'),
    ('UTAS', 'Arsenic Total - Urine'),
    ('UTL',  'Thallium - Urine'),
    ('UTMO', 'Trimethylarsine - Urine'),
    ('UTU',  'Tungsten - Urine'),
    ('UUR',  'Uranium - Urine'),

))]


In [4]:
# Covariate columns whose non-null values are counted for every cohort.
covar_counts = [
    # infant / birth outcome columns
    'babySex', 'birthWt', 'Outcome_weeks', 'headCirc', 'SGA', 'Outcome', 'birthLen',
    # maternal columns
    'age', 'smoking', 'BMI', 'parity', 'education',
]

# Arsenic-related biomarker columns of interest.
bio_counts = ['UTAS', 'PropMMAtoiAs', 'UMMA', 'UDMA']

In [5]:
# Scratch note of the extra column each cohort's count cell appends to
# `covar_counts`: 'SPECIFICGRAVITY_V2' (NEU), 'urine_specific_gravity' (DAR)
# and 'creatininemgdl' (UNM).
# Because of the trailing commas each line is a bare 1-tuple expression; nothing
# is assigned, and only the last tuple is shown as the cell output.
'SPECIFICGRAVITY_V2',
'urine_specific_gravity',
'creatininemgdl',

('creatininemgdl',)

In [6]:
from api import adapters
from api import analysis
import pandas as pd
import numpy as np


Male/female infants - babySex
Birth weight (g) - birthWt
Gestational age (weeks) - Outcome_weeks
Head circumference (cm) - headCirc
SGA - SGA
Preterm - Outcome
Birth length - birthLen
Maternal age - age
Smoke during pregnancy - smoking
Specific gravity - SPECIFICGRAVITY_V2, urine_specific_gravity
Creatinine - creatininemgdl
Maternal BMI - BMI
Parity - parity 
Maternal level of education - education
Total urinary arsenic (µg/L) - UTAS
Summation iAs + MMA + DMA (µg/L) - PropMMAtoiAs + UMMA + UDMA
Inorganic arsenic (µg/L) - PropMMAtoiAs
Monomethylarsonic acid (µg/L) - UMMA
Dimethylarsinic acid (µg/L) - UDMA

In [7]:
# The source tables encode missing values as -9, stored either as a number or
# as a string; both spellings are converted to NaN so that .count() skips them.
MISSING_NUMERIC = -9
MISSING_TEXT = '-9'


def _missing_to_nan(frame):
    """Return a new frame with the -9 / '-9' missing-value codes replaced by NaN."""
    return frame.replace(MISSING_NUMERIC, np.nan).replace(MISSING_TEXT, np.nan)


# covariate data frames
df_neu_covars = _missing_to_nan(adapters.neu.get_dataframe_covars())

# Keep only the TimePeriod == 2 rows and drop exact duplicate rows.
# The result of drop_duplicates() is assigned here; previously it was called
# and discarded, which left df_neu_covars unchanged.
# NOTE(review): confirm why only TimePeriod 2 is used for the NEU covariates.
df_neu_covars = df_neu_covars[df_neu_covars['TimePeriod'] == 2].drop_duplicates()
df_unm_covars = _missing_to_nan(adapters.unm.get_dataframe_covars())
# NOTE(review): DAR covariates come from get_dataframe(), not
# get_dataframe_covars() like the other cohorts - confirm this is intended.
df_dar_covars = _missing_to_nan(adapters.dar.get_dataframe())

# biological data frames
df_neu_bio = _missing_to_nan(adapters.neu.get_dataframe_orig())
df_unm_bio = _missing_to_nan(adapters.unm.get_dataframe_orig())
df_dar_bio = _missing_to_nan(adapters.dar.get_dataframe())

# Non-null count per NEU covariate (plus the NEU specific-gravity column),
# computed over the distinct combinations of those columns.
cov_neu_cnt = (
    df_neu_covars[covar_counts + ['SPECIFICGRAVITY_V2']]
    .drop_duplicates()
    .count()
    .reset_index()
)

In [8]:
# Non-null count per UNM covariate, plus the UNM creatinine column.
cov_unm_cnt = (
    df_unm_covars[covar_counts + ['creatininemgdl']]
    .count()
    .reset_index()
)

In [9]:
# Non-null count per DAR covariate, plus the DAR specific-gravity column.
cov_dar_cnt = (
    df_dar_covars[covar_counts + ['urine_specific_gravity']]
    .count()
    .reset_index()
)

In [17]:
from functools import reduce


def _outer_join_on_variable(left, right):
    """Outer-join two count tables on their shared 'index' (variable name) column."""
    return pd.merge(left, right, on=['index'], how='outer')


# Fold the per-cohort count tables into one table, one column per cohort.
# The outer join keeps variables that exist in only some cohorts.
data_frames = [cov_neu_cnt, cov_unm_cnt, cov_dar_cnt]
df_rep1 = reduce(_outer_join_on_variable, data_frames)
df_rep1.columns = ['variable', 'NEU', 'UNM', 'DAR']
df_rep1

Unnamed: 0,variable,NEU,UNM,DAR
0,babySex,554.0,135.0,10.0
1,birthWt,555.0,135.0,10.0
2,Outcome_weeks,570.0,135.0,10.0
3,headCirc,512.0,135.0,9.0
4,SGA,554.0,135.0,10.0
5,Outcome,570.0,135.0,10.0
6,birthLen,537.0,135.0,9.0
7,age,569.0,135.0,10.0
8,smoking,562.0,135.0,8.0
9,BMI,538.0,135.0,10.0


In [18]:
import os

# The working directory is looked up here but its value is not used or shown.
os.getcwd()

# Write the combined count table to the media folder, without the row index.
# The path is relative to the current working directory.
report_path = '../mediafiles/toni_report.csv'
df_rep1.to_csv(report_path, index=False)

In [12]:
# NEU biological data: non-null UTAS count per time period and cohort type
(
    df_neu_bio
    .groupby(['TimePeriod', 'CohortType'])['UTAS']
    .count()
    .reset_index()
)


Unnamed: 0,TimePeriod,CohortType,UTAS
0,1,NEU,966
1,2,NEU,570
2,3,NEU,664


In [13]:
# Dartmouth (DAR) biological data: non-null count per arsenic column
(
    df_dar_bio[['UTAS', 'UDMA', 'UMMA', 'PropMMAtoiAs']]
    .count()
    .reset_index()
)

Unnamed: 0,index,0
0,UTAS,10
1,UDMA,10
2,UMMA,10
3,PropMMAtoiAs,10


In [14]:
# Combined speciation count: shape of the DAR rows in which PropMMAtoiAs, UMMA
# and UDMA are all non-null.
df_dar_bio[df_dar_bio[['PropMMAtoiAs', 'UMMA', 'UDMA']].notna().all(axis=1)].shape

(10, 198)

In [1]:
# UNM biological data - open question: is there inorganic arsenic for UNM?
# Needs df_unm_bio from the data-loading cell; on a fresh kernel this raises NameError.
(
    df_unm_bio[['UTAS', 'UDMA', 'UMMA']]
    .count()
    .reset_index()
)

NameError: name 'df_unm_bio' is not defined

In [None]:
# DAR biological data: non-null count for UTAS, UDMA and UMMA
(
    df_dar_bio[['UTAS', 'UDMA', 'UMMA']]
    .count()
    .reset_index()
)