In [1]:
# Required to access the database
import os
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

# Data analysis tools
import pandas as pd
import numpy as np
import seaborn as sns

# Models available in our application
from datasets.models import RawFlower, RawUNM, RawDAR
from django.contrib.auth.models import User
from datasets.models import RawDictionary


from datasets.models import RawNEU
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels

!pip install lxml


Defaulting to user installation because normal site-packages is not writeable


In [2]:
from api import adapters
from api import analysis

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
def crude_mixedML(df_merged, x_feature, y_feature, adjust_dilution, use_covars):

    #TODO: Replace covars variable with actual selection of indivdual features

    if adjust_dilution == True:
        df_merged[x_feature] = df_merged[x_feature] / df_merged['UDR']

    if use_covars:
        data = df_merged
        #data.drop(['CohortType'], inplace = True, axis = 1)

    else:
        data = df_merged[[x_feature,y_feature]]

    
    ## problem - if we are using z_score to predict might be an issue
    

    data['intercept'] = 1
    #X and Y features TODO: clean up
    X = data[[x for x in data.columns if x !=y_feature and x!= 'PIN_Patient' and x!='CohortType']]
    
    #print(X.info())
    Y = data[y_feature]
    
    X[x_feature]= np.log(X[x_feature])

    if X.shape[0] > 2:
        reg = sm.MixedLM(Y, X, groups=data["CohortType"], exog_re=X["intercept"]).fit()
        ret = reg.summary()
    else:
        ret = 'error'

    fit_string = y_feature + '~'
    
    for x in X.columns:
        fit_string += ' + ' + str(x)
    
    fit_string = fit_string.replace('~ +','~') + ' + (1|CohortType)'
    header = '<div> <b> Liear Mixed Model with Random Intercept </b> </div>'
    header += '<div> <b> Number samples: </b> ' + str(X.shape[0]) + '</div>'
    header += '<div> <b>  Model: </b>' + fit_string + '</div>'
    header += '<div> <b> Group: </b> CohortType '

    htmls = header + ret.tables[0].to_html() + ret.tables[1].to_html()      
    return ret




def crude_reg(df_merged, x_feature, y_feature, adjust_dilution, use_covars):
   
    ## adjust dilution
    if adjust_dilution == True:
        df_merged[x_feature] = df_merged[x_feature] / df_merged['UDR']

    if use_covars:
        data = df_merged
        data.drop(['CohortType'], inplace = True, axis = 1)

    else:
        data = df_merged[[x_feature,y_feature]]

    
    ## problem - if we are using z_score to predict might be an issue
    
    
    

    data['intercept'] = 1
    #X and Y features TODO: clean up
    X = data[[x for x in data.columns if x !=y_feature and x!= 'PIN_Patient']]
    
    #print(X.info())
    Y = data[y_feature]
    
    X[x_feature]= np.log(X[x_feature])
    
    if df_merged.shape[0] > 2:

        reg = sm.OLS(Y, X).fit() 
        ret = reg.summary()
    else:
        ret = 'error'

    # model string
    fit_string = y_feature + '~'
    for x in X.columns:
        if x == x_feature:
            fit_string += ' + log(' + str(x) +')'
        else:
            fit_string += ' + ' + str(x)
    

    #htmls = header + ret.tables[0].as_html() + ret.tables[1].as_html() 
    df = pd.read_html(ret.tables[1].as_html(),header=0,index_col=0)[0]

    return df

def crude_logreg(df_merged, x_feature, y_feature, adjust_dilution, use_covars):
   
    ## adjust dilution
    if adjust_dilution == True:
        df_merged[x_feature] = df_merged[x_feature] / df_merged['UDR']

    if use_covars:
        data = df_merged
        data.drop(['CohortType'], inplace = True, axis = 1)

    else:
        data = df_merged[[x_feature,y_feature]]

    
    ## problem - if we are using z_score to predict might be an issue
    
    data['intercept'] = 1
    #X and Y features TODO: clean up
    X = data[[x for x in data.columns if x !=y_feature and x!= 'PIN_Patient']]
    
    
    Y = data[y_feature]
    
    X[x_feature]= np.log(X[x_feature])

    # fit the model
    print('columns going into logreg')
    print(X.columns)
    if df_merged.shape[0] > 1:
        log_reg = sm.Logit(Y, X).fit()
        ret = log_reg.summary()
    else:
        ret = 'error'
        
        
    # model string
    fit_string = y_feature + '~'
    for x in X.columns:
        if x == x_feature:
            fit_string += ' + log(' + str(x) +')'
        else:
            fit_string += ' + ' + str(x)
    
    df = pd.read_html(ret.tables[1].as_html(),header=0,index_col=0)[0]     

    return df

In [4]:
def dummy_code(df, covars_cat, contin):
    coded_covars = []
    orig_shape = df.shape[0]
    for var in covars_cat:

        df[var] = pd.Categorical(df[var])

        dummies_df = pd.get_dummies(df[var], prefix = var, drop_first=True)

        coded_covars = coded_covars + [ x for x in dummies_df.columns.tolist()]

        df = pd.concat([df, dummies_df], axis = 1)
        df.drop([var], inplace = True, axis = 1)
        
        assert df.shape[0] == orig_shape
    
    #print(coded_covars + contin)
    return df[coded_covars + contin]


In [5]:
from api import dilutionproc   


def printsummary(df):
    
    x = 1
    # spearate the data into cat and continuous summary:

In [6]:
def merge3CohortFrames2(df1,df2,df3):
    'merge on feature intersections'

    for as_feature in ['UASB', 'UDMA', 'UAS5', 'UIAS', 'UAS3', 'UMMA']:
        if as_feature not in df1.columns:
            df1[as_feature] = np.nan
        if as_feature not in df2.columns:
            df2[as_feature] = np.nan
        if as_feature not in df3.columns:
            df3[as_feature] = np.nan

    s1 = set(df1.columns)
    s2 = set(df2.columns)
    s3 = set(df3.columns)

    cc = set.intersection(s1, s2, s3)

    df_all = pd.concat([df1[cc],df2[cc],df3[cc]])

    return df_all

In [7]:
# Get the data

## Model 1: Restricted to participants with no fish/seafood consumption.

## Get NEU data with no fish
df_NEU = adapters.neu.get_dataframe_orig()
df_NEU = df_NEU[df_NEU['TimePeriod']==2] # Visit 2

df_NEU_covars = adapters.neu.get_dataframe_covars()
df_NEU = df_NEU_covars.merge(df_NEU, on = ['PIN_Patient','CohortType','TimePeriod']) #Merge the covariates

df_NEU = df_NEU[(df_NEU['fish_pu_v2'] == 0) & (df_NEU['fish'] == 0)] #No fish consumption

## Get DAR data with no fish
df_DAR = adapters.dar.get_dataframe_nofish()
## Get UNM data with no fis
df_UNM = adapters.unm.get_dataframe_orig()
#df_UNM = df_UNM[df_UNM['fish']==0]
df_UNM_covars = adapters.unm.get_dataframe_covars()

df_UNM = df_UNM_covars.merge(df_UNM, on = ['PIN_Patient','CohortType','TimePeriod']) #Merge the covariates

df_NEU = df_NEU.replace(-9,np.nan).replace('-9', np.nan)
#df_ALL = analysis.merge3CohortFrames(df_UNM,df_NEU,df_DAR)


df_ALL = merge3CohortFrames2(df_NEU, df_UNM, df_DAR)

frames_for_adjust = [
    ('NEU', df_NEU)
    
]

frames_for_analysis = [
    ('NEU', df_NEU),
    ('ALL', df_ALL)
    
]

for name, df in frames_for_analysis:
    print('Data Stats')
    print(name)
    print(df.shape)

Data Stats
NEU
(379, 55)
Data Stats
ALL
(514, 33)


In [8]:
##Run the adjustment

for name, df_coh in frames_for_adjust:
    print('Working on ', name)

    keep_adj = []
    #variables for fitting procedure
    x_feature = 'UTAS'
    cat_vars = ['babySex','smoking','education','race']
    contin_vars = ['PIN_Patient','BMI','UTAS'] 

    # dummy code
    df_coh_coded_model =  dummy_code(df_coh, cat_vars, contin_vars)

    ## variables for addjustment procedure
    adjust_cat_vars =  ['babySex','smoking','education','race']
    adjust_contin_vars = ['PIN_Patient','CohortType','BMI', 'ga_collection','birth_year','age']
    #add proper variable depending on cohort
    if name == 'NEU':

        adjust_contin_vars= adjust_contin_vars + ['SPECIFICGRAVITY_V2']

    if name == 'UNM':

        adjust_contin_vars = adjust_contin_vars + ['cratininemgl']

    if name == 'DAR':

        adjust_contin_vars = adjust_contin_vars + ['darvar']

    ## adjustment procedure
    if name in ['NEU', 'UNM', 'NEU']:
        #dummy code 
        df_coh_coded_adjust_model =  dummy_code(df_coh, adjust_cat_vars, adjust_contin_vars)

        d_test = df_coh_coded_adjust_model.dropna()

        dil_adj = dilutionproc.predict_dilution(d_test, 'NEU')

        fin = df_coh_coded_model.merge(dil_adj[['PIN_Patient','UDR']], on = ['PIN_Patient'])

        adjs = dil_adj[['PIN_Patient','UDR']]
        adjs.loc[:,'CohortType'] = name

        keep_adj.append(adjs)
        print('Done')

Working on  NEU
Model out 358. afterocnf 358. check ids 358
Done


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [9]:
cohort_adjustmets = pd.concat(keep_adj)

cohort_adjustmets

Unnamed: 0,PIN_Patient,UDR,CohortType
0,1001,0.994136,NEU
1,1006,1.479447,NEU
2,1014,1.109253,NEU
3,1015,1.28224,NEU
4,1021,0.916278,NEU
5,1025,0.953151,NEU
6,1026,1.164949,NEU
7,1034,0.891876,NEU
8,1048,0.930527,NEU
9,1059,1.318351,NEU


In [10]:
frames_to_r = dict()
bin_frames_to_r = dict()

'''
    
    ('UNM', df_UNM),
    ('DAR', df_DAR),
    ('NEUUNM', df_NEUUNM),
    ('NEUDAR', df_NEUDAR),
    ('UNMDAR', df_UNMDAR),
    ('UNMDARNEU', df_merged_3),
]
'''
#d_test = df_NEU[['PIN_Patient','CohortType','race', 'education','babySex','BMI', 'ga_collection','birth_year','age','SPECIFICGRAVITY_V2']]
#all_vars = covars + [x_feature] 
Y_features_continuous = ['Outcome_weeks','birthWt', 'headCirc', 'birthLen']
Y_features_binary    =  ['LGA','SGA','Outcome']




outputs_conf = []
outputs_crude = []

for outcome in Y_features_binary + Y_features_continuous:
    
    
    for name, df_coh in frames_for_analysis:
        print('Working on ', name)
    
        
        #variables for fitting procedure
        x_feature = 'UTAS'
        cat_vars = ['babySex','smoking','education','race']
        contin_vars = ['PIN_Patient','BMI','UTAS'] + [outcome]
        
        # dummy code
        df_coh_coded_model =  dummy_code(df_coh, cat_vars, contin_vars)
        
        ## variables for addjustment procedure
        adjust_cat_vars =  ['babySex','smoking','education','race']
        adjust_contin_vars = ['PIN_Patient','CohortType','BMI', 'ga_collection','birth_year','age']
        
            
        #if name in ['NEU', 'UNM', 'NEU']:
            #dummy code 
        #    print("go")
            
        #    fin = df_coh_coded_model.merge(cohort_adjustmets, on = ['PIN_Patient'])
            
        #    print(fin.columns)
            #sdf
        #if all, then we need an adjustmennt to mixed model
        if name in ['ALL']:
            x = 1
            if len(keep_adj) == 1: df_adj_all = pd.concat(keep_adj)
                
            fin = df_coh_coded_model.merge(df_adj_all, on = ['PIN_Patient'])
            print(fin.columns)
            
        # run models:

        if outcome in Y_features_continuous and name == 'ALL':
            
            fin = fin.dropna()
            
            
            frames_to_r[outcome] = fin
        if outcome in Y_features_binary and name == 'ALL':
            
            fin = fin.dropna()
            
            bin_frames_to_r[outcome] = fin
            







Working on  NEU
Working on  ALL
Index(['babySex_2.0', 'smoking_1.0', 'smoking_3.0', 'education_2.0',
       'education_3.0', 'education_4.0', 'education_5.0', 'race_2.0',
       'race_3.0', 'race_6.0', 'race_97.0', 'race_888.0', 'race_999.0',
       'PIN_Patient', 'BMI', 'UTAS', 'LGA', 'UDR', 'CohortType'],
      dtype='object')
Working on  NEU
Working on  ALL
Index(['babySex_2.0', 'smoking_1.0', 'smoking_3.0', 'education_2.0',
       'education_3.0', 'education_4.0', 'education_5.0', 'race_2.0',
       'race_3.0', 'race_6.0', 'race_97.0', 'race_888.0', 'race_999.0',
       'PIN_Patient', 'BMI', 'UTAS', 'SGA', 'UDR', 'CohortType'],
      dtype='object')
Working on  NEU
Working on  ALL
Index(['babySex_2.0', 'smoking_1.0', 'smoking_3.0', 'education_2.0',
       'education_3.0', 'education_4.0', 'education_5.0', 'race_2.0',
       'race_3.0', 'race_6.0', 'race_97.0', 'race_888.0', 'race_999.0',
       'PIN_Patient', 'BMI', 'UTAS', 'Outcome', 'UDR', 'CohortType'],
      dtype='object')
Wor

In [11]:
frames_to_r.keys()

dict_keys(['Outcome_weeks', 'birthWt', 'headCirc', 'birthLen'])

In [12]:
bin_frames_to_r.keys()

dict_keys(['LGA', 'SGA', 'Outcome'])

# Start R stuff

In [14]:
%load_ext rpy2.ipython


In [15]:
data_outcome_weeks = frames_to_r['Outcome_weeks']
data_birthWt = frames_to_r['birthWt']
data_headCirc = frames_to_r['headCirc']
data_birthLen = frames_to_r['birthLen']

%Rpush data_outcome_weeks
%Rpush data_birthWt
%Rpush data_headCirc
%Rpush data_birthLen

In [16]:
data_outcome_weeks.shape

(358, 19)

In [17]:
fit_str_outcome_weeks = 'Outcome_weeks ~ babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + race_2.0 + race_3.0 + race_6.0 + race_97.0 + \
race_888.0 + race_999.0 + BMI + UTAS + (1|CohortType)' 

fit_str_birthWt = 'birthWt ~ babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + race_2.0 + race_3.0 + race_6.0 + race_97.0 + \
race_888.0 + race_999.0 + BMI + UTAS + (1|CohortType)' 

fit_str_headCirc = 'headCirc ~ babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + race_2.0 + race_3.0 + race_6.0 + race_97.0 + \
race_888.0 + race_999.0 + BMI + UTAS + (1|CohortType)' 

fit_str_birthLen = 'birthLen ~ babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + race_2.0 + race_3.0 + race_6.0 + race_97.0 + \
race_888.0 + race_999.0 + BMI + UTAS + (1|CohortType)' 



%Rpush fit_str_outcome_weeks
%Rpush fit_str_birthWt
%Rpush fit_str_headCirc
%Rpush fit_str_birthLen



In [19]:
%%R
library(lme4)

library(lmerTest)
print(fit_str_outcome_weeks)
m<-lmer(fit_str_outcome_weeks, data=data_outcome_weeks)
back = print(summary(m))


R[write to console]: Loading required package: Matrix

R[write to console]: 
Attaching package: ‘lmerTest’


R[write to console]: The following object is masked from ‘package:lme4’:

    lmer


R[write to console]: The following object is masked from ‘package:stats’:

    step




[1] "Outcome_weeks ~ babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + education_5.0 + race_2.0 + race_3.0 + race_6.0 + race_97.0 + race_888.0 + race_999.0 + BMI + UTAS + (1|CohortType)"


R[write to console]: Error: grouping factors must have > 1 sampled level

R[write to console]: In addition: 

R[write to console]: In (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  :
R[write to console]: 
 
R[write to console]:  libraries ‘/usr/local/lib/R/site-library’, ‘/usr/lib/R/site-library’ contain no packages




Error: grouping factors must have > 1 sampled level


RInterpreterError: Failed to parse and evaluate line 'library(lme4)\n\nlibrary(lmerTest)\nprint(fit_str_outcome_weeks)\nm<-lmer(fit_str_outcome_weeks, data=data_outcome_weeks)\nback = print(summary(m))\n'.
R error message: 'Error: grouping factors must have > 1 sampled level'

In [None]:
%%R
library(lme4)

library(lmerTest)
print(fit_str_birthWt)
m<-lmer(fit_str_birthWt, data=data_birthWt)
sink("all_cohorts_birthWt_UTAS.txt")
print(summary(m))
sink()

In [None]:
%%R
library(lme4)

library(lmerTest)
print(fit_str_headCirc)
m<-lmer(fit_str_headCirc, data=data_headCirc)
sink("all_cohorts_headCirc_UTAS.txt")
print(summary(m))
sink()

In [None]:
%%R
library(lme4)

library(lmerTest)
print(fit_str_birthLen)
m<-lmer(fit_str_birthLen, data=data_birthLen)
sink("all_cohorts_birthLen_UTAS.txt")
print(summary(m))
sink()

In [None]:
# binomial 

In [29]:

data_Outcome = bin_frames_to_r['Outcome']
data_LGA = bin_frames_to_r['LGA']
data_SGA = bin_frames_to_r['SGA']

#data_Outcome.loc[0:50,'CohortType'] = 'NEU2'

%Rpush data_Outcome
%Rpush data_LGA
%Rpush data_SGA

In [37]:
fit_str_outcome = 'Outcome ~ babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + race_2.0 + race_3.0 + race_6.0 + race_97.0 + \
race_888.0 + race_999.0 + BMI + UTAS + (1|CohortType)' 

fit_str_SGA = 'SGA ~ babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + race_2.0 + race_3.0 + race_6.0 + race_97.0 + \
race_888.0 + race_999.0 + BMI + UTAS + (1|CohortType)' 

fit_str_LGA = 'LGA ~ babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + \
education_5.0 + race_2.0 + race_3.0 + race_6.0 + race_97.0 + \
race_888.0 + race_999.0 + BMI + UTAS + (1|CohortType)' 



%Rpush fit_str_outcome
%Rpush fit_str_SGA
%Rpush fit_str_LGA

In [None]:
sink("lm.txt")
print(summary(lm(cars$speed ~ cars$dist)))
sink()  # returns output to the console

In [39]:
%%R -o back

library(lme4)

library(lmerTest)

print(fit_str_outcome)

m<-glmer(fit_str_outcome, data=data_Outcome, family = binomial)
sink("all_cohorts_Outcome_UTAS.txt")
print(summary(m))
sink()



[1] "Outcome ~ babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + education_5.0 + race_2.0 + race_3.0 + race_6.0 + race_97.0 + race_888.0 + race_999.0 + BMI + UTAS + (1|CohortType)"


R[write to console]: boundary (singular) fit: see help('isSingular')

R[write to console]: 
Correlation matrix not shown by default, as p = 14 > 12.
Use print(summary(m), correlation=TRUE)  or
    vcov(summary(m))        if you need it




In [27]:
%%R
library(lme4)

library(lmerTest)

print(fit_str_SGA)
m<-glmer(fit_str_SGA, data=data_SGA, family = binomial)
sink("all_cohorts_SGA_UTAS.txt")
print(summary(m))
sink()

[1] "SGA ~ babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + education_5.0 + race_2.0 + race_3.0 + race_6.0 + race_97.0 + race_888.0 + race_999.0 + BMI + UTAS + (1|CohortType)"


R[write to console]: Error: grouping factors must have > 1 sampled level




Error: grouping factors must have > 1 sampled level


RInterpreterError: Failed to parse and evaluate line 'library(lme4)\n\nlibrary(lmerTest)\n\nprint(fit_str_SGA)\nm<-glmer(fit_str_SGA, data=data_SGA, family = binomial)\nback = print(summary(m))\n'.
R error message: 'Error: grouping factors must have > 1 sampled level'

In [28]:
%%R
library(lme4)

library(lmerTest)

print(fit_str_LGA)
m<-glmer(fit_str_LGA, data=data_LGA, family = binomial)
sink("all_cohorts_LGA_UTAS.txt")
print(summary(m))
sink()

[1] "LGA ~ babySex_2.0 + education_2.0 + education_3.0 + education_4.0 + education_5.0 + race_2.0 + race_3.0 + race_6.0 + race_97.0 + race_888.0 + race_999.0 + BMI + UTAS + (1|CohortType)"


R[write to console]: Error: grouping factors must have > 1 sampled level




Error: grouping factors must have > 1 sampled level


RInterpreterError: Failed to parse and evaluate line 'library(lme4)\n\nlibrary(lmerTest)\n\nprint(fit_str_LGA)\nm<-glmer(fit_str_LGA, data=data_LGA, family = binomial)\nback = print(summary(m))\n'.
R error message: 'Error: grouping factors must have > 1 sampled level'