### Setup

This notebook explains how to to access the dataset models from django.
The datasets are loaded from the Postgres database into pandas dataframes.

To start the notebook server:

```
# Start a bash shell inside the api container
docker-compose exec api /bin/bash

# Start the jupyter notebook
python manage.py shell_plus --notebook

# Take note of the token provided and access the notebook through:
<ip-of-the-machine>:7777/?<token>
http://127.0.0.1:7777/?token=30c43675981e671b4a609cff470819098e274bbde415b7f5
```

This step has only to be made once as long as the jupyter-notebook keeps executing.
Considering that the the notebook continues to run, you can access the notebook using:
``` 
<ip-of-the-machine>:7777/?<token>
```

In [1]:
# Required to access the database
import os
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

# Data analysis tools
import pandas as pd
import numpy as np
import seaborn as sns

# Models available in our application
from datasets.models import RawFlower, RawUNM, RawDAR
from django.contrib.auth.models import User

In [2]:
from api import adapters
from api import analysis

In [3]:


df1 = adapters.neu.get_dataframe()
df2 = adapters.unm.get_dataframe()
df3 = adapters.dar.get_dataframe()

In [4]:
analysis.cohortdescriptive(df1,df2,df3)

['custom-analysis-sumarry-and-correlation.ipynb', 'custom-analysis-template.ipynb', 'custom-analysis-template-Copy1.ipynb', 'custom-analysis-linear-mixed.ipynb', 'Untitled.ipynb', '.ipynb_checkpoints']


Unnamed: 0,level_0_cnt,Dartmouth_cnt,NEU_cnt,UNM_cnt,Dartmouth_mean,NEU_mean,UNM_mean
0,TimePeriod,10,2200,14,1.0,1.862727,1.0
1,Outcome,10,2200,14,0.1,0.083636,0.571429


In [5]:
df = pd.DataFrame.from_records(
    RawUNM.objects.values()
)
df.columns

Index(['id', 'PIN_Patient', 'Member_c', 'TimePeriod', 'Analyte', 'Result',
       'Creat_Corr_Result', 'Outcome', 'Outcome_weeks', 'age', 'ethnicity',
       'race', 'education', 'BMI', 'income', 'smoking', 'parity',
       'preg_complications', 'folic_acid_supp', 'fish', 'babySex', 'birthWt',
       'birthLen'],
      dtype='object')

In [6]:
to_corr_cols = ['TimePeriod', 'Member_c', 'Outcome', 'Outcome_weeks', 'age',
                'ethnicity', 'race', 'BMI', 'smoking', 'parity', 'preg_complications',
                'folic_acid_supp', 'fish', 'babySex', 'birthWt', 'birthLen','UTAS','UHG']

analysis.getCorrelationHeatmap(df1, to_corr_cols)

KeyError: 'Outcome_weeks'

In [7]:
x_cols = ['fish','folic_acid_supp']
y_cols = ['UHG','UTAS']

analysis.getSpearmans(df1, x_cols, y_cols).round(3)

KeyError: "None of [Index(['fish', 'folic_acid_supp'], dtype='object', name='Analyte')] are in the [columns]"

In [None]:
to_corr_cols = ['TimePeriod', 'Member_c', 'Outcome', 'Outcome_weeks', 'age',
       'ethnicity', 'race', 'education', 'BMI', 'income', 'smoking', 'parity',
       'preg_complications', 'folic_acid_supp', 'fish', 'babySex', 'birthWt',
       'birthLen', 'UTAS', 'CohortType']

analysis.getCorrelationHeatmap(df2, to_corr_cols)

In [None]:
x_cols = ['fish','folic_acid_supp']
y_cols = ['UHG','UTAS']

analysis.getSpearmans(df2, x_cols, y_cols).round(3)

In [None]:
to_corr_cols_dar = ['Member_c', 'TimePeriod', 
                   'sample_gestage_days', 'Outcome', 
                   'age','ethnicity','race','education','BMI',
                   'smoking','parity', 'preg_complications',
                   'folic_acid_supp','babySex',
                   'birthWt','birthLen','headCirc','ponderal','PNFFQTUNA',
                   'PNFFQFR_FISH_KIDS','PNFFQSHRIMP_CKD','PNFFQDK_FISH','PNFFQOTH_FISH',
                   'mfsp_6','fish','TOTALFISH_SERV','UIAS', 'UASB', 'UAS3', 'UAS5','UHG','UAS']

analysis.getCorrelationHeatmap(df3, to_corr_cols_dar)

In [None]:
x_cols = ['PNFFQFR_FISH_KIDS','PNFFQSHRIMP_CKD','PNFFQDK_FISH','PNFFQOTH_FISH','mfsp_6','fish','TOTALFISH_SERV']
y_cols = ['UIAS', 'UASB', 'UAS3', 'UAS5','UHG','UAS']

analysis.getSpearmans(df3, x_cols, y_cols).round(3)[0:20]