In [None]:
# Required to access the database
import os
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

# Data analysis tools
import pandas as pd
import numpy as np
import seaborn as sns

# Models available in our application
from datasets.models import RawFlower, RawUNM, RawDAR
from django.contrib.auth.models import User


from datasets.models import RawNEU
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels

!pip install lxml


In [None]:
from api import adapters
from api import analysis

# Show every column and every row when a DataFrame is displayed (None = no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
def crude_reg(df_merged, x_feature, y_feature, adjust_dilution, use_covars):
    """Fit an OLS model of ``y_feature`` on ``log(x_feature)`` and return the coefficient table.

    Parameters
    ----------
    df_merged : pandas.DataFrame
        Analysis frame. It is copied before any transformation, so the
        caller's frame is never modified.
    x_feature : str
        Exposure column; it is natural-log transformed before fitting.
    y_feature : str
        Outcome column.
    adjust_dilution : bool
        If truthy, ``x_feature`` is first divided by the ``UDR`` column.
    use_covars : bool
        If truthy, every column except ``CohortType``, ``UDR``,
        ``PIN_Patient`` and the outcome enters the model; otherwise only
        ``x_feature`` does. An ``intercept`` column of ones is always added.

    Returns
    -------
    pandas.DataFrame
        ``tables[1]`` of the statsmodels summary (the coefficient table),
        parsed back from its HTML rendering and indexed by term name.

    Raises
    ------
    ValueError
        If ``df_merged`` has two rows or fewer.
    KeyError
        If a required column is missing.
    """
    # Local import keeps the block self-contained; read_html wants a
    # file-like object (literal HTML strings are deprecated in pandas).
    from io import StringIO

    n_rows = df_merged.shape[0]
    if n_rows <= 2:
        # Previously this path produced the string 'error' and then crashed
        # with an AttributeError when the summary tables were accessed.
        raise ValueError(
            'crude_reg needs more than 2 rows to fit a model, got ' + str(n_rows)
        )

    # Work on a private copy: dividing by UDR or dropping columns on the
    # caller's frame would corrupt it for any later call.
    frame = df_merged.copy()

    if adjust_dilution:
        frame[x_feature] = frame[x_feature] / frame['UDR']

    if use_covars:
        data = frame.drop(columns=['CohortType', 'UDR'])
    else:
        data = frame[[x_feature, y_feature]].copy()

    ## problem - if we are using z_score to predict might be an issue
    data['intercept'] = 1

    predictors = [col for col in data.columns
                  if col != y_feature and col != 'PIN_Patient']
    X = data[predictors].copy()
    Y = data[y_feature]

    X[x_feature] = np.log(X[x_feature])

    reg = sm.OLS(Y, X).fit()
    coef_html = reg.summary().tables[1].as_html()

    return pd.read_html(StringIO(coef_html), header=0, index_col=0)[0]

def crude_logreg(df_merged, x_feature, y_feature, adjust_dilution, use_covars):
    """Fit a binomial GLM of ``y_feature`` on ``log(x_feature)`` and return the coefficient table.

    Parameters
    ----------
    df_merged : pandas.DataFrame
        Analysis frame. It is copied before any transformation, so the
        caller's frame is never modified.
    x_feature : str
        Exposure column; it is natural-log transformed before fitting.
    y_feature : str
        Binary outcome column.
    adjust_dilution : bool
        If truthy, ``x_feature`` is first divided by the ``UDR`` column.
    use_covars : bool
        If truthy, every column except ``CohortType``, ``UDR``,
        ``PIN_Patient`` and the outcome enters the model; otherwise only
        ``x_feature`` does. An ``intercept`` column of ones is always added.

    Returns
    -------
    pandas.DataFrame
        ``tables[1]`` of the statsmodels summary (the coefficient table),
        parsed back from its HTML rendering and indexed by term name.

    Raises
    ------
    ValueError
        If ``df_merged`` has one row or fewer.
    KeyError
        If a required column is missing.
    """
    # Local import keeps the block self-contained; read_html wants a
    # file-like object (literal HTML strings are deprecated in pandas).
    from io import StringIO

    n_rows = df_merged.shape[0]
    if n_rows <= 1:
        # Previously this path produced the string 'error' and then crashed
        # with an AttributeError when the summary tables were accessed.
        raise ValueError(
            'crude_logreg needs more than 1 row to fit a model, got ' + str(n_rows)
        )

    # Work on a private copy: dividing by UDR or dropping columns on the
    # caller's frame would corrupt it for any later call.
    frame = df_merged.copy()

    if adjust_dilution:
        frame[x_feature] = frame[x_feature] / frame['UDR']

    if use_covars:
        data = frame.drop(columns=['CohortType', 'UDR'])
    else:
        data = frame[[x_feature, y_feature]].copy()

    ## problem - if we are using z_score to predict might be an issue
    data['intercept'] = 1

    predictors = [col for col in data.columns
                  if col != y_feature and col != 'PIN_Patient']
    X = data[predictors].copy()
    Y = data[y_feature]

    X[x_feature] = np.log(X[x_feature])

    # fit the model
    print('columns going into logreg')
    print(X.columns)
    log_reg = sm.GLM(Y, X, family=sm.families.Binomial()).fit()
    coef_html = log_reg.summary().tables[1].as_html()

    return pd.read_html(StringIO(coef_html), header=0, index_col=0)[0]

In [None]:
def dummy_code(df, covars_cat, contin):
    """Dummy-code the categorical covariates and return them with the continuous columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is left untouched.
    covars_cat : list of str
        Categorical columns. Each is expanded with ``pd.get_dummies`` using
        the column name as prefix; the first level is dropped as reference.
    contin : list of str
        Columns to carry over unchanged.

    Returns
    -------
    pandas.DataFrame
        The dummy columns (in ``covars_cat`` order) followed by ``contin``,
        with the same rows as ``df``.

    Raises
    ------
    KeyError
        If a requested column is missing, or if ``contin`` names a column
        that is also listed in ``covars_cat``.
    """
    # astype('category') returns a new Series, so the caller's column keeps
    # its dtype (assigning pd.Categorical back into df would change it).
    dummy_frames = [
        pd.get_dummies(df[var].astype('category'), prefix=var, drop_first=True)
        for var in covars_cat
    ]
    coded_covars = [col for frame in dummy_frames for col in frame.columns]

    # One concat for all dummies instead of rebuilding the frame per variable.
    coded = pd.concat([df.drop(columns=list(covars_cat))] + dummy_frames, axis=1)

    # Index alignment in concat must not add or remove rows.
    assert coded.shape[0] == df.shape[0]

    return coded[coded_covars + list(contin)]


In [None]:
from api import dilutionproc

In [None]:
# Get the data
#
# First pass: load every participant and print the frame sizes. The fish
# consumption filters below are commented out, so nothing is restricted here.
# NOTE(review): every name assigned in this pass (df_NEU, df_NEU_covars,
# df_DAR, df_UNM, df_UNM_covars, df_ALL, frames_for_adjust,
# frames_for_analysis) is assigned again further down in this same cell, so
# only the printed shapes survive from this pass.

## (Original heading: "Model 1: Restricted to participants with no fish/seafood
## consumption" -- that restriction is applied by the second pass, not here.)

## Get NEU data (all participants)
df_NEU = adapters.neu.get_dataframe_orig()
df_NEU = df_NEU[df_NEU['TimePeriod']==2] # Visit 2

df_NEU_covars = adapters.neu.get_dataframe_covars()
df_NEU = df_NEU_covars.merge(df_NEU, on = ['PIN_Patient','CohortType','TimePeriod']) #Merge the covariates
# Treat race codes 999, 888 and 97 as missing
# (presumably "unknown/other" sentinels -- TODO confirm against the codebook).
df_NEU['race'] = df_NEU['race'].replace(999, np.nan).replace(888,np.nan).replace(97, np.nan)
#df_NEU = df_NEU[(df_NEU['fish_pu_v2'] == 0) & (df_NEU['fish'] == 0)] #No fish consumption

## Get DAR data (all participants)
df_DAR = adapters.dar.get_dataframe()

# Cast the binary outcome columns to float.
for feature in ['LGA','Outcome','SGA']:
    df_DAR[feature] = df_DAR[feature].astype(float)
## Get UNM data (all participants)
df_UNM = adapters.unm.get_dataframe_orig()
#df_UNM = df_UNM[df_UNM['fish']==0]
df_UNM_covars = adapters.unm.get_dataframe_covars()



# Drop covariate rows without a creatinine value (around 130 entries per the
# original note), then drop exact duplicate rows.
df_UNM_covars = df_UNM_covars[~df_UNM_covars['creatininemgdl'].isna()].drop_duplicates()
df_UNM = df_UNM_covars.merge(df_UNM, on = ['PIN_Patient','CohortType','TimePeriod']) #Merge the covariates

# Replace -9 (numeric or string) with NaN throughout the NEU frame
# (presumably a missing-value code -- TODO confirm).
df_NEU = df_NEU.replace(-9,np.nan).replace('-9', np.nan)
#df_ALL = analysis.merge3CohortFrames(df_UNM,df_NEU,df_DAR)
# NOTE(review): dead assignment -- df_ALL is rebound to the merged frame below.
df_ALL = df_NEU

# NEU works


# Cohorts that go through the dilution adjustment (no pooled frame).
frames_for_adjust = [
    ('NEU', df_NEU),
    ('UNM', df_UNM),
    ('DAR', df_DAR)
]


# Pooled frame across the three cohorts, plus the list iterated for analysis.
df_ALL = analysis.merge3CohortFrames(df_NEU, df_UNM, df_DAR)
frames_for_analysis = [
    ('NEU', df_NEU),
    ('UNM', df_UNM),
    ('DAR', df_DAR),
    ('ALL', df_ALL)
    
]

# Report the number of rows/columns per frame.
for name, df in frames_for_analysis:
    print('Data Stats')
    print(name)
    print(df.shape)


# Get the data (second pass)
#
# This pass rebinds every frame loaded earlier in this cell, so these are the
# frames that the following cells actually use.

## Model 1: Restricted to participants with no fish/seafood consumption.
## NOTE(review): unlike the pass above, this one does not recode race
## 999/888/97 to NaN and does not cast DAR's LGA/Outcome/SGA to float --
## confirm whether that difference is intentional.

## Get NEU data with no fish
df_NEU = adapters.neu.get_dataframe_orig()
df_NEU = df_NEU[df_NEU['TimePeriod']==2] # Visit 2

df_NEU_covars = adapters.neu.get_dataframe_covars()
df_NEU = df_NEU_covars.merge(df_NEU, on = ['PIN_Patient','CohortType','TimePeriod']) #Merge the covariates

# Keep rows where both fish indicators are 0.
df_NEU = df_NEU[(df_NEU['fish_pu_v2'] == 0) & (df_NEU['fish'] == 0)] #No fish consumption

## Get DAR data with no fish (the adapter provides a dedicated no-fish loader)
df_DAR = adapters.dar.get_dataframe_nofish()
## Get UNM data with no fish (the filter is applied to the covariates frame)
df_UNM = adapters.unm.get_dataframe_orig()
df_UNM_covars = adapters.unm.get_dataframe_covars()
df_UNM_covars = df_UNM_covars[df_UNM_covars['fish']==0]

# Drop covariate rows without a creatinine value (around 130 entries per the
# original note), then drop exact duplicate rows.
df_UNM_covars = df_UNM_covars[~df_UNM_covars['creatininemgdl'].isna()].drop_duplicates()
df_UNM = df_UNM_covars.merge(df_UNM, on = ['PIN_Patient','CohortType','TimePeriod']) #Merge the covariates

# Replace -9 (numeric or string) with NaN throughout the NEU frame
# (presumably a missing-value code -- TODO confirm).
df_NEU = df_NEU.replace(-9,np.nan).replace('-9', np.nan)
#df_ALL = analysis.merge3CohortFrames(df_UNM,df_NEU,df_DAR)
# NOTE(review): dead assignment -- df_ALL is rebound to the merged frame below.
df_ALL = df_NEU

# NEU works


# Cohorts that go through the dilution adjustment (no pooled frame).
frames_for_adjust = [
    ('NEU', df_NEU),
    ('UNM', df_UNM),
    ('DAR', df_DAR)
]


# Pooled frame across the three cohorts, plus the list iterated for analysis.
df_ALL = analysis.merge3CohortFrames(df_NEU, df_UNM, df_DAR)
frames_for_analysis = [
    ('NEU', df_NEU),
    ('UNM', df_UNM),
    ('DAR', df_DAR),
    ('ALL', df_ALL)
    
]

# Report the number of rows/columns per frame.
for name, df in frames_for_analysis:
    print('Data Stats')
    print(name)
    print(df.shape)



In [None]:
# Inspect the column names of the merged UNM frame.
df_UNM.columns

In [None]:
#df_ALL['race'].value_counts()

In [None]:

#df_DAR['race'].replace('6.0','1.0').value_counts()

In [None]:
# Shape of the UNM covariates after removing rows without creatinine and exact duplicates.
df_UNM_covars[~df_UNM_covars['creatininemgdl'].isna()].drop_duplicates().shape

In [None]:
# Cast creatinine to float. The column is replaced on the df_UNM object itself,
# which is the same object held in frames_for_adjust / frames_for_analysis.
df_UNM['creatininemgdl'] = df_UNM['creatininemgdl'].astype(float)

In [None]:
## Run the dilution adjustment for every cohort and collect the per-patient
## UDR values in keep_adj (one frame per cohort).

# Cohort-specific urine dilution measure that is handed to the dilution model.
dilution_measure = {
    'NEU': 'SPECIFICGRAVITY_V2',
    'UNM': 'creatininemgdl',
    'DAR': 'urine_specific_gravity',
}

keep_adj = []
for name, df_coh in frames_for_adjust:
    print('Working on ', name)

    # variables for fitting procedure
    x_feature = 'UTAS'
    cat_vars = ['babySex','smoking','education']
    contin_vars = ['PIN_Patient','BMI','UTAS']

    # dummy code
    df_coh_coded_model = dummy_code(df_coh, cat_vars, contin_vars)

    ## variables for adjustment procedure
    adjust_cat_vars = ['babySex','smoking','education','race']
    adjust_contin_vars = ['PIN_Patient','CohortType','BMI', 'ga_collection','birth_year','age']

    ## adjustment procedure (only for cohorts with a known dilution measure)
    if name in dilution_measure:
        # add the proper dilution variable depending on cohort
        adjust_contin_vars = adjust_contin_vars + [dilution_measure[name]]

        df_coh_coded_adjust_model = dummy_code(df_coh, adjust_cat_vars, adjust_contin_vars)

        # The dilution model is fed complete cases only.
        d_test = df_coh_coded_adjust_model.dropna()

        dil_adj = dilutionproc.predict_dilution(d_test, name)

        # `fin` is not used again in this cell; it is kept defined because
        # later cells may read it.
        fin = df_coh_coded_model.merge(dil_adj[['PIN_Patient','UDR']], on = ['PIN_Patient'])

        # assign() builds a new frame, so no column is written onto a slice
        # of dil_adj (the former .loc assignment triggered SettingWithCopyWarning).
        adjs = dil_adj[['PIN_Patient','UDR']].assign(CohortType=name)
        print(adjs.shape)
        keep_adj.append(adjs)
        print('Done')

In [None]:
# Stack the per-cohort (PIN_Patient, UDR, CohortType) tables collected in keep_adj into one frame.
cohort_adjustments = pd.concat(keep_adj)



In [None]:

# Build one model frame per outcome for the mixed models fitted in R.
# Continuous outcomes land in frames_to_r, binary ones in bin_frames_to_r,
# both keyed by outcome name.
Y_features_continuous = ['Outcome_weeks','birthWt', 'headCirc', 'birthLen']
Y_features_binary    =  ['LGA','Outcome','SGA']

outputs_conf = []
outputs_crude = []

bin_frames_to_r = dict()
frames_to_r = dict()

# variables for fitting procedure
x_feature = 'UTAS'
cat_vars = ['babySex','smoking','education']

for outcome in Y_features_binary + Y_features_continuous:

    contin_vars = ['PIN_Patient','BMI','UTAS','parity'] + [outcome]

    for name, df_coh in frames_for_analysis:
        print('Working on ', name)

        # Dummy-code every cohort frame, so a missing column in any of them
        # still surfaces here.
        df_coh_coded_model = dummy_code(df_coh, cat_vars, contin_vars)

        if outcome in Y_features_binary:
            print(name)
            print(outcome)

        if name != 'ALL':
            # The result dicts are keyed by outcome only and the pooled frame
            # is processed last, so per-cohort frames were always overwritten.
            # Skipping them also stops `fin` from being read before this
            # iteration has assigned it. (The former parity drop for
            # UNM/'Outcome' only ever touched such a discarded frame.)
            continue

        # Attach the dilution ratio (UDR) and cohort label, keep complete cases.
        fin = df_coh_coded_model.merge(cohort_adjustments, on = ['PIN_Patient']).dropna()

        if outcome in Y_features_continuous:
            frames_to_r[outcome] = fin

        if outcome in Y_features_binary:
            bin_frames_to_r[outcome] = fin
            
            

            



In [None]:
# Which cohort labels are present in the most recently built `fin` frame.
fin['CohortType'].unique()

# R code

In [None]:
# Load the rpy2 IPython extension; the %%R and %Rpush magics used below come from it.
%load_ext rpy2.ipython

In [None]:
# Directory that receives the R model summaries. exist_ok=True keeps the cell
# re-runnable: os.mkdir raised FileExistsError on every run after the first.
os.makedirs("rresultslme4", exist_ok=True)

In [None]:
data_outcome_weeks = frames_to_r['Outcome_weeks']
data_birthWt = frames_to_r['birthWt']
data_headCirc = frames_to_r['headCirc']
data_birthLen = frames_to_r['birthLen']

%Rpush data_outcome_weeks
%Rpush data_birthWt
%Rpush data_headCirc
%Rpush data_birthLen

In [None]:
data_outcome_weeks.shape

In [None]:
# lme4 formulas for the continuous outcomes: fixed effects for parity, sex,
# education, BMI and UTAS plus a random intercept per cohort.
# Adjacent string literals are joined by the parser, so each formula is one string.
fit_str_outcome_weeks = (
    'Outcome_weeks ~ parity + babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + '
    'education_5.0 + BMI + UTAS + (1|CohortType)'
)

fit_str_birthWt = (
    'birthWt ~ parity +  babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + '
    'education_5.0 + BMI + UTAS + (1|CohortType)'
)

fit_str_headCirc = (
    'headCirc ~ parity +  babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + '
    'education_5.0 + BMI + UTAS + (1|CohortType)'
)

fit_str_birthLen = (
    'birthLen ~ parity +  babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + '
    'education_5.0 + BMI + UTAS + (1|CohortType)'
)



%Rpush fit_str_outcome_weeks
%Rpush fit_str_birthWt
%Rpush fit_str_headCirc
%Rpush fit_str_birthLen


In [None]:
%%R
# Linear mixed model for Outcome_weeks; the summary is written to a text file.
library(lme4)
library(lmerTest)

print(fit_str_outcome_weeks)
m <- lmer(fit_str_outcome_weeks, data = data_outcome_weeks)

# capture.output() restores the output stream even when printing the summary
# fails; a bare sink()/sink() pair would leave all later output diverted.
# NOTE(review): this file name differs from the one written for the binary
# `Outcome` model ("all_cohorts_Outcome_UTAS.txt") only by letter case, so the
# two collide on a case-insensitive filesystem.
capture.output(print(summary(m)), file = "rresultslme4/all_cohorts_outcome_UTAS.txt")

In [None]:
%%R
# Linear mixed model for birthWt; the summary is written to a text file.
library(lme4)
library(lmerTest)

print(fit_str_birthWt)
m <- lmer(fit_str_birthWt, data = data_birthWt)

# capture.output() restores the output stream even when printing the summary
# fails; a bare sink()/sink() pair would leave all later output diverted.
capture.output(print(summary(m)), file = "rresultslme4/all_cohorts_birthWt_UTAS.txt")

In [None]:
%%R
# Linear mixed model for headCirc; the summary is written to a text file.
library(lme4)
library(lmerTest)

print(fit_str_headCirc)
m <- lmer(fit_str_headCirc, data = data_headCirc)

# capture.output() restores the output stream even when printing the summary
# fails; a bare sink()/sink() pair would leave all later output diverted.
capture.output(print(summary(m)), file = "rresultslme4/all_cohorts_headCirc_UTAS.txt")

In [None]:
%%R
# Linear mixed model for birthLen; the summary is written to a text file.
library(lme4)
library(lmerTest)

print(fit_str_birthLen)
m <- lmer(fit_str_birthLen, data = data_birthLen)

# capture.output() restores the output stream even when printing the summary
# fails; a bare sink()/sink() pair would leave all later output diverted.
capture.output(print(summary(m)), file = "rresultslme4/all_cohorts_birthLen_UTAS.txt")

In [None]:
# binomial 

In [None]:
data_Outcome = bin_frames_to_r['Outcome']
data_LGA = bin_frames_to_r['LGA']
data_SGA = bin_frames_to_r['SGA']

#data_Outcome.loc[0:50,'CohortType'] = 'NEU2'

%Rpush data_Outcome
%Rpush data_LGA
%Rpush data_SGA

In [None]:
# lme4 formulas for the binary outcomes: fixed effects for parity, sex,
# education, BMI and UTAS plus a random intercept per cohort.
# Adjacent string literals are joined by the parser, so each formula is one string.
fit_str_outcome = (
    'Outcome ~parity +  babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + '
    'education_5.0 + BMI + UTAS + (1|CohortType)'
)

fit_str_SGA = (
    'SGA ~ parity +  babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + '
    'education_5.0 + BMI + UTAS + (1|CohortType)'
)

fit_str_LGA = (
    'LGA ~ parity +  babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + '
    'education_5.0 + BMI + UTAS + (1|CohortType)'
)



%Rpush fit_str_outcome
%Rpush fit_str_SGA
%Rpush fit_str_LGA

In [None]:
%%R
# Binomial mixed model for Outcome; the summary is written to a text file.
library(lme4)
library(lmerTest)

print(fit_str_outcome)
m <- glmer(fit_str_outcome, data = data_Outcome, family = binomial)

# capture.output() restores the output stream even when printing the summary
# fails; a bare sink()/sink() pair would leave all later output diverted.
# NOTE(review): this file name differs from the one written for the
# Outcome_weeks model ("all_cohorts_outcome_UTAS.txt") only by letter case, so
# the two collide on a case-insensitive filesystem.
capture.output(print(summary(m)), file = "rresultslme4/all_cohorts_Outcome_UTAS.txt")


In [None]:
%%R
# Binomial mixed model for SGA; the summary is written to a text file.
library(lme4)
library(lmerTest)

print(fit_str_SGA)
m <- glmer(fit_str_SGA, data = data_SGA, family = binomial)

# capture.output() restores the output stream even when printing the summary
# fails; a bare sink()/sink() pair would leave all later output diverted.
capture.output(print(summary(m)), file = "rresultslme4/all_cohorts_SGA_UTAS.txt")

In [None]:
%%R
# Binomial mixed model for LGA; the summary is written to a text file.
library(lme4)
library(lmerTest)

print(fit_str_LGA)
m <- glmer(fit_str_LGA, data = data_LGA, family = binomial)

# capture.output() restores the output stream even when printing the summary
# fails; a bare sink()/sink() pair would leave all later output diverted.
capture.output(print(summary(m)), file = "rresultslme4/all_cohorts_LGA_UTAS.txt")

In [None]:
# quick histogram


#df_ALL