In [None]:
# Required to access the database
import os
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

# Data analysis tools
import pandas as pd
import numpy as np
import seaborn as sns

# Models available in our application
from datasets.models import RawFlower, RawUNM, RawDAR
from django.contrib.auth.models import User
from datasets.models import RawDictionary


from datasets.models import RawNEU
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels

!pip install lxml

In [None]:
# Project-local helpers: `adapters` provides the per-cohort data loaders used
# below; `analysis` is referenced only from commented-out code in this notebook.
from api import adapters
from api import analysis

# Disable pandas' display truncation so every column and every row of a
# displayed DataFrame is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
def dummy_code(df, covars_cat, contin):
    """Dummy-code categorical covariates and return only the model columns.

    Each column named in ``covars_cat`` is converted to a categorical and
    expanded with ``pd.get_dummies(..., drop_first=True)``, so the first
    category becomes the reference level and gets no indicator column.
    Indicator columns are named ``<var>_<category>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is NOT modified; the result is a new frame.
    covars_cat : sequence of str
        Names of the categorical columns to dummy-code.
    contin : sequence of str
        Names of the columns to carry through unchanged.

    Returns
    -------
    pandas.DataFrame
        Frame with the indicator columns (in ``covars_cat`` order) followed by
        the ``contin`` columns, with the same rows and index as ``df``.

    Raises
    ------
    KeyError
        If a name in ``covars_cat`` or ``contin`` is not a column of ``df``.
    """
    orig_shape = df.shape[0]
    coded_covars = []
    dummy_frames = []

    for var in covars_cat:
        # astype() returns a new Series, so the caller's column keeps its dtype
        # (assigning pd.Categorical back into df would change the input frame).
        categorical_col = df[var].astype('category')
        dummies_df = pd.get_dummies(categorical_col, prefix=var, drop_first=True)
        coded_covars.extend(dummies_df.columns.tolist())
        dummy_frames.append(dummies_df)

    # Replace the raw categorical columns with their indicator columns.
    coded = pd.concat([df.drop(columns=list(covars_cat))] + dummy_frames, axis=1)

    # Column-wise concatenation must never add or lose rows.
    assert coded.shape[0] == orig_shape

    return coded[coded_covars + list(contin)]


In [None]:
from api import dilutionproc   

In [None]:
def merge3CohortFrames2(df1,df2,df3):
    """Stack three cohort frames on the columns they all share.

    Before intersecting, each of the arsenic feature columns ('UASB', 'UDMA',
    'UAS5', 'UIAS', 'UAS3', 'UMMA') that is missing from a frame is added to
    that frame filled with NaN, so these features always survive the
    intersection.

    Side effect: the missing feature columns are added to the input frames
    in place.

    Parameters
    ----------
    df1, df2, df3 : pandas.DataFrame
        The cohort frames to combine.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation (df1, then df2, then df3) restricted to the
        common columns, ordered as they appear in ``df1``. Original row
        indices are kept.
    """
    frames = (df1, df2, df3)

    for as_feature in ['UASB', 'UDMA', 'UAS5', 'UIAS', 'UAS3', 'UMMA']:
        for frame in frames:
            if as_feature not in frame.columns:
                frame[as_feature] = np.nan

    shared = set.intersection(*(set(frame.columns) for frame in frames))

    # Index with an ordered list, not the set itself: pandas rejects set
    # indexers, and a list gives a deterministic column order.
    common_cols = list(dict.fromkeys(col for col in df1.columns if col in shared))

    return pd.concat([frame[common_cols] for frame in frames])

In [None]:
# Load the three cohorts (NEU, DAR, UNM), attach their covariates, and build
# the combined frame used for the pooled analysis.

## NOTE: the fish/seafood restriction ("Model 1") is currently NOT applied;
## both fish filters below are commented out, so all participants are kept.

## Get NEU data (visit 2 only)
df_NEU = adapters.neu.get_dataframe_orig()
df_NEU = df_NEU[df_NEU['TimePeriod']==2] # Visit 2

df_NEU_covars = adapters.neu.get_dataframe_covars()
df_NEU = df_NEU_covars.merge(df_NEU, on = ['PIN_Patient','CohortType','TimePeriod']) # Merge the covariates

# Disabled filter: restrict NEU to participants with no fish consumption
#df_NEU = df_NEU[(df_NEU['fish_pu_v2'] == 0) & (df_NEU['fish'] == 0)] #No fish consumption

## Get DAR data
df_DAR = adapters.dar.get_dataframe()

# Cast the binary outcome columns to float
for feature in ['LGA','Outcome','SGA']:
    df_DAR[feature] = df_DAR[feature].astype(float)
## Get UNM data
df_UNM = adapters.unm.get_dataframe_orig()
# Disabled filter: restrict UNM to participants with no fish consumption
#df_UNM = df_UNM[df_UNM['fish']==0]
df_UNM_covars = adapters.unm.get_dataframe_covars()

df_UNM = df_UNM_covars.merge(df_UNM, on = ['PIN_Patient','CohortType','TimePeriod']) # Merge the covariates

# Replace the value -9 (numeric or string) with NaN in the NEU frame only.
# NOTE(review): presumably -9 is a missing-value code; confirm against the data dictionary.
df_NEU = df_NEU.replace(-9,np.nan).replace('-9', np.nan)
#df_ALL = analysis.merge3CohortFrames(df_UNM,df_NEU,df_DAR)


# Combined frame: the three cohorts stacked on their shared columns
df_ALL = merge3CohortFrames2(df_NEU, df_UNM, df_DAR)

# (label, frame) pairs that go through the dilution adjustment
frames_for_adjust = [
    ('NEU', df_NEU),
    ('UNM', df_UNM),
    ('DAR', df_DAR)
]


#df_ALL = analysis.merge3CohortFrames(df_NEU, df_UNM, df_DAR)
# (label, frame) pairs that are modelled: each cohort plus the combined frame
frames_for_analysis = [
    ('NEU', df_NEU),
    ('UNM', df_UNM),
    ('DAR', df_DAR),
    ('ALL', df_ALL)
    
]

# Report the shape of every frame that will be analysed
for name, df in frames_for_analysis:
    print('Data Stats')
    print(name)
    print(df.shape)

In [None]:
## Run the dilution adjustment for each individual cohort.
## Result: keep_adj, a list with one frame per cohort holding the columns
## PIN_Patient, UDR (taken from dilutionproc.predict_dilution) and CohortType.

# Cohort-specific urine dilution measure passed to the dilution model
dilution_measure = {
    'NEU': 'SPECIFICGRAVITY_V2',
    'UNM': 'creatininemgdl',
    'DAR': 'urine_specific_gravity',
}

## variables for adjustment procedure
adjust_cat_vars =  ['babySex','smoking','education','race']
base_adjust_contin_vars = ['PIN_Patient','CohortType','BMI', 'ga_collection','birth_year','age']

keep_adj = []
for name, df_coh in frames_for_adjust:
    print('Working on ', name)

    # Only cohorts with a known dilution measure can be adjusted
    if name not in dilution_measure:
        continue

    adjust_contin_vars = base_adjust_contin_vars + [dilution_measure[name]]

    # dummy code the covariates used by the adjustment model
    df_coh_coded_adjust_model = dummy_code(df_coh, adjust_cat_vars, adjust_contin_vars)

    # the dilution model is given complete cases only
    d_test = df_coh_coded_adjust_model.dropna()

    dil_adj = dilutionproc.predict_dilution(d_test, name)

    # assign() builds a new frame, so the cohort label is never written into
    # a slice of dil_adj (which triggers SettingWithCopyWarning).
    adjs = dil_adj[['PIN_Patient','UDR']].assign(CohortType=name)
    print(adjs.shape)
    keep_adj.append(adjs)
    print('Done')

In [None]:
# Stack the per-cohort adjustment frames collected in keep_adj into one frame.
cohort_adjustments = pd.concat(keep_adj)

# Bare expression so the notebook renders the combined frame.
cohort_adjustments

In [None]:
# Create the output directories the R cells write their model summaries to.
# exist_ok=True makes re-running this cell a no-op, while genuine failures
# (e.g. a permission error) are raised instead of being reported as 'exists'.
for results_dir in ('rresultslme4', 'rresultslmer4', 'rresultsglm4', 'rresultsglmer4'):
    os.makedirs(results_dir, exist_ok=True)

In [None]:
# Dictionaries holding the data frames handed to R, keyed '<cohort>|<outcome>':
#   *_indv -> one entry per individual cohort (NEU, UNM, DAR)
#   *_all  -> entries for the combined 'ALL' frame
#   bin_*  -> binary outcomes; the others hold continuous outcomes
frames_to_r_indv = dict()
bin_frames_to_r_indv = dict()
frames_to_r_all = dict()
bin_frames_to_r_all = dict()


#d_test = df_NEU[['PIN_Patient','CohortType','race', 'education','babySex','BMI', 'ga_collection','birth_year','age','SPECIFICGRAVITY_V2']]
#all_vars = covars + [x_feature]
Y_features_continuous = ['Outcome_weeks','birthWt', 'headCirc', 'birthLen']
Y_features_binary    =  ['LGA','Outcome','SGA']


outputs_conf = []
outputs_crude = []

# variables for fitting procedure
x_feature = 'UTAS'
cat_vars = ['babySex','smoking','education']

# Adjustments of all cohorts stacked together. Built unconditionally so a
# missing cohort can never leave this name undefined or stale.
df_adj_all = pd.concat(keep_adj)

for outcome in Y_features_binary + Y_features_continuous:

    is_binary = outcome in Y_features_binary
    contin_vars = ['PIN_Patient','BMI','UTAS','parity'] + [outcome]

    for name, df_coh in frames_for_analysis:
        print('Working on ', name)

        # dummy code the covariates used by the outcome model
        df_coh_coded_model =  dummy_code(df_coh, cat_vars, contin_vars)

        if name in ['NEU', 'UNM', 'DAR']:
            # Use only this cohort's adjustments: the merge key is PIN_Patient
            # alone, so rows of another cohort that happen to share an ID must
            # not be matched.
            adjustments = df_adj_all[df_adj_all['CohortType'] == name]
            destination = bin_frames_to_r_indv if is_binary else frames_to_r_indv
        elif name in ['ALL']:
            # NOTE(review): this merge is on PIN_Patient only; if IDs can
            # repeat across cohorts, rows will be cross-matched here. Confirm
            # IDs are unique across cohorts or merge on CohortType as well.
            adjustments = df_adj_all
            destination = bin_frames_to_r_all if is_binary else frames_to_r_all
        else:
            continue

        fin = df_coh_coded_model.merge(adjustments, on = ['PIN_Patient'])

        # complete cases only
        fin = fin.dropna()
        destination[name + '|' + outcome] = fin

In [None]:
# Debug output: dump the list of per-cohort adjustment frames.
print(keep_adj)

In [None]:
# List the '<cohort>|<outcome>' keys built for the continuous outcomes.
frames_to_r_indv.keys()

In [None]:
# List the '<cohort>|<outcome>' keys built for the binary outcomes.
bin_frames_to_r_indv.keys()

In [None]:
# Sanity check: (rows, columns) of the UNM frame for the Outcome_weeks model.
frames_to_r_indv['UNM|Outcome_weeks'].shape

# Start R stuff

In [None]:
# Load the rpy2 IPython extension, which provides the %%R and %Rpush magics used below.
%load_ext rpy2.ipython


In [None]:
# Analysis for individual data

In [None]:
data_outcome_weeks_NEU = frames_to_r_indv['NEU|Outcome_weeks']
data_birthWt_NEU  = frames_to_r_indv['NEU|birthWt']
data_headCirc_NEU = frames_to_r_indv['NEU|headCirc']
data_birthLen_NEU = frames_to_r_indv['NEU|birthLen']

%Rpush data_outcome_weeks_NEU
%Rpush data_birthWt_NEU
%Rpush data_headCirc_NEU
%Rpush data_birthLen_NEU


data_outcome_weeks_DAR = frames_to_r_indv['DAR|Outcome_weeks']
data_birthWt_DAR  = frames_to_r_indv['DAR|birthWt']
data_headCirc_DAR = frames_to_r_indv['DAR|headCirc']
data_birthLen_DAR = frames_to_r_indv['DAR|birthLen']

%Rpush data_outcome_weeks_DAR
%Rpush data_birthWt_DAR
%Rpush data_headCirc_DAR
%Rpush data_birthLen_DAR


data_outcome_weeks_UNM = frames_to_r_indv['UNM|Outcome_weeks']
data_birthWt_UNM  = frames_to_r_indv['UNM|birthWt']
data_headCirc_UNM = frames_to_r_indv['UNM|headCirc']
data_birthLen_UNM = frames_to_r_indv['UNM|birthLen']

%Rpush data_outcome_weeks_UNM
%Rpush data_birthWt_UNM
%Rpush data_headCirc_UNM
%Rpush data_birthLen_UNM

In [None]:
fit_str_outcome_weeks_indv = 'Outcome_weeks ~ parity + babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + BMI + UTAS' 

fit_str_birthWt_indv = 'birthWt ~ parity +  babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + BMI + UTAS' 

fit_str_headCirc_indv = 'headCirc ~ parity +  babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + BMI + UTAS' 

fit_str_birthLen_indv = 'birthLen ~ parity +  babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + BMI + UTAS' 


%Rpush fit_str_outcome_weeks_indv
%Rpush fit_str_birthWt_indv
%Rpush fit_str_headCirc_indv
%Rpush fit_str_birthLen_indv


In [None]:
#NEU

In [None]:
# Sanity check: (rows, columns) of the DAR frame for the Outcome_weeks model.
print(data_outcome_weeks_DAR.shape)

In [None]:
%%R
library(lme4)
library(lmerTest)

# NEU cohort: ordinary linear model of each continuous outcome.
# Each result file receives the formula string followed by the model summary.
# Every model is fitted BEFORE sink() is opened, so a fitting error cannot
# leave console output diverted into a file.

m<-lm(fit_str_outcome_weeks_indv, data=data_outcome_weeks_NEU)
sink("rresultslme4/NEU_cohorts_outcome_UTAS.txt")
print(fit_str_outcome_weeks_indv)
print(summary(m))
sink()


m<-lm(fit_str_birthWt_indv, data=data_birthWt_NEU)
sink("rresultslme4/NEU_cohorts_birthWt_UTAS.txt")
print(fit_str_birthWt_indv)
print(summary(m))
sink()


m<-lm(fit_str_headCirc_indv, data=data_headCirc_NEU)
sink("rresultslme4/NEU_cohorts_headCirc_UTAS.txt")
print(fit_str_headCirc_indv)
print(summary(m))
sink()


# The formula is printed after sink() so it lands in the file like the
# other three outcomes.
m<-lm(fit_str_birthLen_indv, data=data_birthLen_NEU)
sink("rresultslme4/NEU_cohorts_birthLen_UTAS.txt")
print(fit_str_birthLen_indv)
print(summary(m))
sink()

In [None]:
#DAR

In [None]:
%%R
library(lme4)
library(lmerTest)

# DAR cohort: ordinary linear model of each continuous outcome.
# Each result file receives the formula string followed by the model summary.
# Every model is fitted BEFORE sink() is opened, so a fitting error cannot
# leave console output diverted into a file.

m<-lm(fit_str_outcome_weeks_indv, data=data_outcome_weeks_DAR)
sink("rresultslme4/DAR_cohorts_outcome_UTAS.txt")
print(fit_str_outcome_weeks_indv)
print(summary(m))
sink()


m<-lm(fit_str_birthWt_indv, data=data_birthWt_DAR)
sink("rresultslme4/DAR_cohorts_birthWt_UTAS.txt")
print(fit_str_birthWt_indv)
print(summary(m))
sink()


m<-lm(fit_str_headCirc_indv, data=data_headCirc_DAR)
sink("rresultslme4/DAR_cohorts_headCirc_UTAS.txt")
print(fit_str_headCirc_indv)
print(summary(m))
sink()


# The formula is printed after sink() so it lands in the file like the
# other three outcomes.
m<-lm(fit_str_birthLen_indv, data=data_birthLen_DAR)
sink("rresultslme4/DAR_cohorts_birthLen_UTAS.txt")
print(fit_str_birthLen_indv)
print(summary(m))
sink()

In [None]:
#UNM

In [None]:
%%R
library(lme4)
library(lmerTest)

# UNM cohort: ordinary linear model of each continuous outcome.
# Each result file receives the formula string followed by the model summary.
# Every model is fitted BEFORE sink() is opened, so a fitting error cannot
# leave console output diverted into a file.

m<-lm(fit_str_outcome_weeks_indv, data=data_outcome_weeks_UNM)
sink("rresultslme4/UNM_cohorts_outcome_UTAS.txt")
print(fit_str_outcome_weeks_indv)
print(summary(m))
sink()


m<-lm(fit_str_birthWt_indv, data=data_birthWt_UNM)
sink("rresultslme4/UNM_cohorts_birthWt_UTAS.txt")
print(fit_str_birthWt_indv)
print(summary(m))
sink()


m<-lm(fit_str_headCirc_indv, data=data_headCirc_UNM)
sink("rresultslme4/UNM_cohorts_headCirc_UTAS.txt")
print(fit_str_headCirc_indv)
print(summary(m))
sink()


# The formula is printed after sink() so it lands in the file like the
# other three outcomes.
m<-lm(fit_str_birthLen_indv, data=data_birthLen_UNM)
sink("rresultslme4/UNM_cohorts_birthLen_UTAS.txt")
print(fit_str_birthLen_indv)
print(summary(m))
sink()

In [None]:
# Analysis for individual logistic regression results
data_Outcome_NEU = bin_frames_to_r_indv['NEU|Outcome']
data_LGA_NEU = bin_frames_to_r_indv['NEU|LGA']
data_SGA_NEU = bin_frames_to_r_indv['NEU|SGA']

data_Outcome_DAR = bin_frames_to_r_indv['DAR|Outcome']
data_LGA_DAR = bin_frames_to_r_indv['DAR|LGA']
data_SGA_DAR = bin_frames_to_r_indv['DAR|SGA']

data_Outcome_UNM = bin_frames_to_r_indv['UNM|Outcome']
data_LGA_UNM = bin_frames_to_r_indv['UNM|LGA']
data_SGA_UNM = bin_frames_to_r_indv['UNM|SGA']


%Rpush data_Outcome_NEU
%Rpush data_LGA_NEU
%Rpush data_SGA_NEU

%Rpush data_Outcome_DAR
%Rpush data_LGA_DAR
%Rpush data_SGA_DAR

%Rpush data_Outcome_UNM
%Rpush data_LGA_UNM
%Rpush data_SGA_UNM


fit_str_outcome_indv = 'Outcome ~ parity + babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + BMI + UTAS' 

fit_str_SGA_indv = 'SGA ~ parity + babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + BMI + UTAS' 

fit_str_LGA_indv = 'LGA ~ parity + babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + BMI + UTAS' 



%Rpush fit_str_outcome_indv
%Rpush fit_str_SGA_indv
%Rpush fit_str_LGA_indv



In [None]:
#NEU

In [None]:
%%R
library(lme4)
library(plyr)
library(lmerTest)

# NEU cohort: logistic regression (binomial glm) of each binary outcome.
# Each result file receives the formula string, the model summary and the
# number of rows in the data frame. Models are fitted BEFORE sink() is opened
# so a fitting error cannot leave console output diverted into a file.

# The formula is printed after sink() so it lands in the file like the
# LGA and SGA outputs below.
m<-glm(fit_str_outcome_indv, data=data_Outcome_NEU, family = binomial)
sink("rresultsglm4/NEU_outcome_UTAS.txt")
print(fit_str_outcome_indv)
print(summary(m))
print(paste(toString(NROW(data_Outcome_NEU)), " observations.", sep = " "))
sink()


m<-glm(fit_str_LGA_indv, data=data_LGA_NEU, family = binomial)
sink("rresultsglm4/NEU_LGA_UTAS.txt")
print(fit_str_LGA_indv)
print(summary(m))
print(paste(toString(NROW(data_LGA_NEU)), " observations.", sep = " "))
sink()


m<-glm(fit_str_SGA_indv, data=data_SGA_NEU, family = binomial)
sink("rresultsglm4/NEU_SGA_UTAS.txt")
print(fit_str_SGA_indv)
print(summary(m))
print(paste(toString(NROW(data_SGA_NEU)), " observations.", sep = " "))
sink()

In [None]:
# DB to do: 
# 1: Update log regressions for DAR and UNM
# 2: Print summary and number of observations for each var
# 3: Print lm, glm, lme, glme summaries to individual csvs (i.e. 'DAR_glme_summary.csv')

In [None]:
#UNM

In [None]:
%%R
library(lme4)
library(plyr)
library(lmerTest)

# UNM cohort: logistic regression (binomial glm) of each binary outcome.
# Each result file receives the formula string, the model summary and the
# number of rows in the data frame. Models are fitted BEFORE sink() is opened
# so a fitting error cannot leave console output diverted into a file.

# The formula is printed after sink() so it lands in the file like the
# LGA and SGA outputs below.
m<-glm(fit_str_outcome_indv, data=data_Outcome_UNM, family = binomial)
sink("rresultsglm4/UNM_outcome_UTAS.txt")
print(fit_str_outcome_indv)
print(summary(m))
print(paste(toString(NROW(data_Outcome_UNM)), " observations.", sep = " "))
sink()


m<-glm(fit_str_LGA_indv, data=data_LGA_UNM, family = binomial)
sink("rresultsglm4/UNM_LGA_UTAS.txt")
print(fit_str_LGA_indv)
print(summary(m))
print(paste(toString(NROW(data_LGA_UNM)), " observations.", sep = " "))
sink()


m<-glm(fit_str_SGA_indv, data=data_SGA_UNM, family = binomial)
sink("rresultsglm4/UNM_SGA_UTAS.txt")
print(fit_str_SGA_indv)
print(summary(m))
print(paste(toString(NROW(data_SGA_UNM)), " observations.", sep = " "))
sink()

In [None]:
#DAR

In [None]:
%%R
library(lme4)
library(plyr)
library(lmerTest)

# DAR cohort: logistic regression (binomial glm) of each binary outcome.
# Each result file receives the formula string, the model summary and the
# number of rows in the data frame. Models are fitted BEFORE sink() is opened
# so a fitting error cannot leave console output diverted into a file.

# The formula is printed after sink() so it lands in the file like the
# LGA and SGA outputs below.
m<-glm(fit_str_outcome_indv, data=data_Outcome_DAR, family = binomial)
sink("rresultsglm4/DAR_outcome_UTAS.txt")
print(fit_str_outcome_indv)
print(summary(m))
print(paste(toString(NROW(data_Outcome_DAR)), " observations.", sep = " "))
sink()


m<-glm(fit_str_LGA_indv, data=data_LGA_DAR, family = binomial)
sink("rresultsglm4/DAR_LGA_UTAS.txt")
print(fit_str_LGA_indv)
print(summary(m))
print(paste(toString(NROW(data_LGA_DAR)), " observations.", sep = " "))
sink()


m<-glm(fit_str_SGA_indv, data=data_SGA_DAR, family = binomial)
sink("rresultsglm4/DAR_SGA_UTAS.txt")
print(fit_str_SGA_indv)
print(summary(m))
print(paste(toString(NROW(data_SGA_DAR)), " observations.", sep = " "))
sink()

In [None]:
# Analysis for combined DATA

In [None]:
data_outcome_weeks = frames_to_r_all['ALL|Outcome_weeks']
data_birthWt = frames_to_r_all['ALL|birthWt']
data_headCirc = frames_to_r_all['ALL|headCirc']
data_birthLen = frames_to_r_all['ALL|birthLen']

%Rpush data_outcome_weeks
%Rpush data_birthWt
%Rpush data_headCirc
%Rpush data_birthLen

In [None]:
# Sanity check: (rows, columns) of the combined frame for the Outcome_weeks model.
data_outcome_weeks.shape

In [None]:
fit_str_outcome_weeks = 'Outcome_weeks ~ parity + babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + BMI + UTAS + (1|CohortType)' 

fit_str_birthWt = 'birthWt ~ parity +  babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + BMI + UTAS + (1|CohortType)' 

fit_str_headCirc = 'headCirc ~ parity +  babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + BMI + UTAS + (1|CohortType)' 

fit_str_birthLen = 'birthLen ~ parity +  babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + BMI + UTAS + (1|CohortType)' 



%Rpush fit_str_outcome_weeks
%Rpush fit_str_birthWt
%Rpush fit_str_headCirc
%Rpush fit_str_birthLen



In [None]:
%%R
library(lme4)

library(lmerTest)
# Combined data: linear mixed model for Outcome_weeks.
# The formula is echoed to the notebook; the file receives the model summary
# and the number of rows in the data frame.
print(fit_str_outcome_weeks)
m<-lmer(fit_str_outcome_weeks, data=data_outcome_weeks)
sink('rresultslmer4/all_cohorts_outcome_weeks_UTAS.txt')
print(summary(m))
print(paste(toString(NROW(data_outcome_weeks)), " observations.", sep = " "))
sink()


In [None]:
%%R
library(lme4)

library(lmerTest)
# Combined data: linear mixed model for birthWt.
# The formula is echoed to the notebook; the file receives the model summary
# and the number of rows in the data frame.
print(fit_str_birthWt)
m<-lmer(fit_str_birthWt, data=data_birthWt)
sink("rresultslmer4/all_cohorts_birthWt_UTAS.txt")
print(summary(m))
print(paste(toString(NROW(data_birthWt)), " observations.", sep = " "))
sink()

In [None]:
%%R
library(lme4)

library(lmerTest)
# Combined data: linear mixed model for headCirc.
# The formula is echoed to the notebook; the file receives the model summary
# and the number of rows in the data frame.
print(fit_str_headCirc)
m<-lmer(fit_str_headCirc, data=data_headCirc)
# Write into rresultslmer4, the directory this notebook creates and the other
# lmer cells use; "rresultslmer" is never created, so sink() would fail.
sink("rresultslmer4/all_cohorts_headCirc_UTAS.txt")
print(summary(m))
print(paste(toString(NROW(data_headCirc)), " observations.", sep = " "))
sink()

In [None]:
%%R
library(lme4)

library(lmerTest)
# Combined data: linear mixed model for birthLen.
# The formula is echoed to the notebook; the file receives the model summary
# and the number of rows in the data frame.
print(fit_str_birthLen)
m<-lmer(fit_str_birthLen, data=data_birthLen)
sink("rresultslmer4/all_cohorts_birthLen_UTAS.txt")
print(summary(m))
print(paste(toString(NROW(data_birthLen)), " observations.", sep = " "))
sink()

In [None]:
# binomial 

In [None]:

data_Outcome = bin_frames_to_r_all['ALL|Outcome']
data_LGA = bin_frames_to_r_all['ALL|LGA']
data_SGA = bin_frames_to_r_all['ALL|SGA']

#data_Outcome.loc[0:50,'CohortType'] = 'NEU2'

%Rpush data_Outcome
%Rpush data_LGA
%Rpush data_SGA


fit_str_outcome = 'Outcome ~parity +  babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + BMI + UTAS + (1|CohortType)' 

fit_str_SGA = 'SGA ~ parity +  babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + BMI + UTAS + (1|CohortType)' 

fit_str_LGA = 'LGA ~ parity +  babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + BMI + UTAS + (1|CohortType)' 


%Rpush fit_str_outcome
%Rpush fit_str_SGA
%Rpush fit_str_LGA

In [None]:
%%R 

library(lme4)

library(lmerTest)

# Combined data: binomial generalized linear mixed model for Outcome.
# The formula is echoed to the notebook; the file receives the model summary
# and the number of rows in the data frame.
print(fit_str_outcome)

m<-glmer(fit_str_outcome, data=data_Outcome, family = binomial)
sink("rresultsglmer4/all_cohorts_Outcome_UTAS.txt")
print(summary(m))
print(paste(toString(NROW(data_Outcome)), " observations.", sep = " "))
sink()



In [None]:
%%R
library(lme4)

library(lmerTest)

# Combined data: binomial generalized linear mixed model for SGA.
# The formula is echoed to the notebook; the file receives the model summary
# and the number of rows in the data frame.
print(fit_str_SGA)
m<-glmer(fit_str_SGA, data=data_SGA, family = binomial)
sink("rresultsglmer4/all_cohorts_SGA_UTAS.txt")
print(summary(m))
print(paste(toString(NROW(data_SGA)), " observations.", sep = " "))
sink()

In [None]:
%%R
library(lme4)

library(lmerTest)

# Combined data: binomial generalized linear mixed model for LGA.
# The formula is echoed to the notebook; the file receives the model summary
# and the number of rows in the data frame.
print(fit_str_LGA)
m<-glmer(fit_str_LGA, data=data_LGA, family = binomial)
sink("rresultsglmer4/all_cohorts_LGA_UTAS.txt")
print(summary(m))
print(paste(toString(NROW(data_LGA)), " observations.", sep = " "))
sink()