In [1]:
import os
from pathlib import Path
from os import path

import geopandas as gpd
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm

from statsmodels.stats.outliers_influence import variance_inflation_factor

# 1. Input data

In [2]:
cwd = os.getcwd()
root_dir = Path(cwd).parent

In [3]:
# Input data

# Individual data
data_IND_tot = pd.read_csv(path.join(root_dir,
                                    "data/raw/individual_data.csv"))

# Household data
data_HH_tot = pd.read_csv(path.join(root_dir,
                                    "data/raw/household_data.csv"))

## Geodata (confidential, not publicly accessible)

# Mathare
mat_geo = gpd.read_file(path.join(root_dir,
                                  "data_confidential/preprocessed/data_Mat_HH_and_DiarrCases_blr.gpkg"))

# Mukuru
muk_geo = gpd.read_file(path.join(root_dir,
                                  "data_confidential/preprocessed/data_Muk_HH_and_DiarrCases_blr.gpkg"))

# 2. Preprocess data

## 2.1. Join household attributes to individuals

In [4]:
## Join tables

# Dummy dataframes
dst_data = data_IND_tot.copy().drop(['Unnamed: 0'],axis=1)
src_data = data_HH_tot.copy().drop(['Unnamed: 0','Site','uuid'],axis=1)
src_data = src_data.rename(columns={"KEY": "PARENT_KEY"})

# Join
dst_data = dst_data.merge(src_data, how='left', on='PARENT_KEY')

# Check results
dst_data

Unnamed: 0,KEY,PARENT_KEY,Site,Relation_to_HH,Age,Sex,School_past,Diarrhoea,HH_Number,DrinkingWaterGroup,...,HHITEMS/Microwave,HHITEMS/Oven,HHITEMS/Radio,HHITEMS/Refrigerator,HHITEMS/Smartphone,HHITEMS/Television,HHITEMS/Noitems,HHITEMS/NA,HHROOMS,StreetFood
0,uuid:2b72ea56-8a29-4874-94ae-d3f2b699f725/Rep_...,uuid:2b72ea56-8a29-4874-94ae-d3f2b699f725,Mathare,Head,25.0,M,Secondary,N,448,Public,...,0,0,1,0,1,0,0,0,1,5_Or_More
1,uuid:335fd59e-5c49-4d12-a143-04ed378cf332/Rep_...,uuid:335fd59e-5c49-4d12-a143-04ed378cf332,Mathare,Head,38.0,F,Primary,N,711,Public,...,0,0,1,0,1,0,0,0,1,2_4
2,uuid:335fd59e-5c49-4d12-a143-04ed378cf332/Rep_...,uuid:335fd59e-5c49-4d12-a143-04ed378cf332,Mathare,S_D,19.0,M,Secondary,N,711,Public,...,0,0,1,0,1,0,0,0,1,2_4
3,uuid:335fd59e-5c49-4d12-a143-04ed378cf332/Rep_...,uuid:335fd59e-5c49-4d12-a143-04ed378cf332,Mathare,S_D,12.0,F,,N,711,Public,...,0,0,1,0,1,0,0,0,1,2_4
4,uuid:90692aea-e337-41c7-8d8f-0b8083b037cc/Rep_...,uuid:90692aea-e337-41c7-8d8f-0b8083b037cc,Mathare,Head,24.0,M,Primary,N,298c,Public,...,0,0,1,0,0,0,0,0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3781,uuid:86c835c1-c642-432c-9c69-177958aa9d29/Rep_...,uuid:86c835c1-c642-432c-9c69-177958aa9d29,Mukuru,W_H,22.0,F,Primary,N,327,Public,...,0,0,1,0,0,1,0,0,1,5_Or_More
3782,uuid:007e2f7b-0d59-4de9-aab1-3509c6c4566f/Rep_...,uuid:007e2f7b-0d59-4de9-aab1-3509c6c4566f,Mukuru,Head,36.0,M,Secondary,N,1253,Public,...,0,0,1,0,0,1,0,0,1,0_1
3783,uuid:007e2f7b-0d59-4de9-aab1-3509c6c4566f/Rep_...,uuid:007e2f7b-0d59-4de9-aab1-3509c6c4566f,Mukuru,W_H,28.0,F,,N,1253,Public,...,0,0,1,0,0,1,0,0,1,0_1
3784,uuid:007e2f7b-0d59-4de9-aab1-3509c6c4566f/Rep_...,uuid:007e2f7b-0d59-4de9-aab1-3509c6c4566f,Mukuru,S_D,9.0,M,,N,1253,Public,...,0,0,1,0,0,1,0,0,1,0_1


## 2.2. Set exposure variables and subsets (age groups)

In [5]:
## Set exposure variables necessary for analysis: water facilities' characteristics + potential confounders

# Dummy dataframe
df = dst_data.copy()


## Outcome of interest

# Diarrhea
diarr_mask = (df["Diarrhoea"]=='Y')
df['Case'] = np.nan
df['Case'][(~df["Diarrhoea"].isna())] = 0
df['Case'][diarr_mask] = 1


## Water services

# Recode access to "basic water" (following JMP definition) VS any other service type (including unimproved)
wt_basic_mask = (((df["DrinkingWaterGroup"]=='Private')&
                  (df["DrinkingWater_Private"].isin(['Piped_dwel','Piped_yard'])))|
                 ((df["DrinkingWaterGroup"]=='Private')&
                  (df["DrinkingWater_Private"].isin(['Piped_neigh']))&
                  (df["DrinkingWaterDist"]<=30))|
                 ((df["DrinkingWaterGroup"]=='Public')&
                  (df["DrinkingWater_Public"].isin(['Public_Tap','Vendor','Public_Dispenser','Ground_tube']))&
                  (df["DrinkingWaterDist"]<=30)))
df["BasicWt"] = np.nan
df["BasicWt"][(~df["DrinkingWaterGroup"].isna())] = 0
df["BasicWt"][wt_basic_mask] = 1

# Recode distance to water source (whether 30 min or less for a round-trip)
wt_near_mask = ((df["DrinkingWaterGroup"]=='Private')&
                (df["DrinkingWater_Private"].isin(['Piped_dwel','Piped_yard']))|
                ((df["DrinkingWaterDist"]<=30)&(~df["DrinkingWaterDist"].isna())))
df["WtSrcNear"] = np.nan
df["WtSrcNear"][(~df["DrinkingWaterGroup"].isna())] = 0
df["WtSrcNear"][wt_near_mask] = 1
    
# Recode access to water piped to premises VS any other service type (including unimproved)
wt_piped_mask = (df["DrinkingWater_Private"].isin(['Piped_dwel','Piped_yard'])) # water piped to own dwelling or yard
df["PipedWt"] = np.nan
df["PipedWt"][(~df["DrinkingWaterGroup"].isna())] = 0
df["PipedWt"][wt_piped_mask] = 1

# Recode access to piped water from neighbor VS any other service type (including unimproved)
wt_pipnb_mask = ((df["DrinkingWater_Private"].isin(['Piped_neigh']))&
                 (df["DrinkingWaterDist"]<=30)) # water piped to a neighbor's dwelling or yard
df["PipNbWt"] = np.nan
df["PipNbWt"][(~df["DrinkingWaterGroup"].isna())] = 0
df["PipNbWt"][wt_pipnb_mask] = 1

# Recode use of water from public water taps/dispensers VS any other service type (including unimproved)
wt_pubtp_mask = ((df["DrinkingWater_Public"].isin(['Public_Tap','Public_Dispenser']))&
                 (df["DrinkingWaterDist"]<=30)) # water from public tap or dispenser ('kiosk'/'ATM')
df["PbTapWt"] = np.nan
df["PbTapWt"][(~df["DrinkingWaterGroup"].isna())] = 0
df["PbTapWt"][wt_pubtp_mask] = 1

# Recode use of water from commercial source (in a public location) VS any other service type (including unimproved)
wt_cmsrc_mask = ((df["DrinkingWater_Public"]=='Vendor')&
                 (df["DrinkingWaterDist"]<=30)) # water mainly obtained from street vendors
df["CmSrcWt"] = np.nan
df["CmSrcWt"][(~df["DrinkingWaterGroup"].isna())] = 0
df["CmSrcWt"][wt_cmsrc_mask] = 1

# Recode use of water from street vendors VS any other service type (including unimproved)
wt_stven_mask = ((df["DrinkingWater_Public"]=='Vendor')&
                 (df["DrinkingWater_PublicVendor"].isin(['Vendor_Cart', 'Vendor_Bottled', 'Vendor_Sachet']))&
                 (df["DrinkingWaterDist"]<=30)) # water mainly obtained from street vendors
df["StVenWt"] = np.nan
df["StVenWt"][(~df["DrinkingWaterGroup"].isna())] = 0
df["StVenWt"][wt_stven_mask] = 1

# Recode water storage (whether household stores water they consume)
wt_strg = (df["DrinkingWaterContainer"].isna())
df["WtStored"] = np.nan
df["WtStored"][~wt_strg] = 1
df["WtStored"][wt_strg] = 0

# Recode water storage recipient (whether household uses closed recipient, when getting water from outside premises)
wt_recipok = (((df['DrinkingWaterContainer'].isin(['Plastic_Closed']))&
               (df['DrinkingWaterContainer2'].isin(['Plastic_Closed','Metal_Closed'])))|
              ((df['DrinkingWaterContainer'].isin(['Plastic_Closed']))&
               (df['DrinkingWaterContainer2'].isin(['Only_one']))))
df["WtStrgOk"] = np.nan
df["WtStrgOk"][~df["DrinkingWaterContainer"].isna()] = 0
df["WtStrgOk"][wt_recipok] = 1

# Recode water availability in the past month (self-reported)
wt_avail_mask = (df["DrinkingWaterAvailability"]=='N') # no lack of water in the previous month
df["AvailblWtM"] = np.nan
df["AvailblWtM"][~df["DrinkingWaterAvailability"].isna()] = 0
df["AvailblWtM"][wt_avail_mask] = 1

# Recode treatment of drinking water (well treated X not well treated, or not treated at all)
wt_treat_mask = ((df["DrinkingWaterTreatment"]=="Y")&
                 (df["DrinkingWaterTreatment2"].isin(['BleachChlorine',
                                                      'Boil',
                                                      'Boil BleachChlorine',
                                                      'BleachChlorine Boil'])))
df["WtTreatment"] = np.nan
df["WtTreatment"][((~df["DrinkingWaterTreatment"].isna())&
                  (~df["DrinkingWaterTreatment2"].isna()))|
                  (df["DrinkingWaterTreatment"]=="N")] = 0
df["WtTreatment"][wt_treat_mask] = 1


## Sanitation services

# Recode sanitation variable: "improved" (according to WHO-UNICEF's JMP) VS any other facitilty type (including unimproved and O.D.)
imprv_sn_mask = ((((df.ToiletFacilityTYPE=='DryOrCompost')&
                   (df.ToiletFacilityTYPE_Dry.isin(['Dry_ImprSlab',
                                                    'Dry_VIP',
                                                    'Dry_FreshLife'])))|
                  ((df.ToiletFacilityTYPE=='Flush')&
                   (df.ToiletFacilityTYPE_Flush.isin(['Flush_piped',
                                                      'Flush_septic_tank',
                                                      'Flush_coveredPit']))))
                 #&(df.ToiletFacilitySHARE=='N') # obs.: 'improved' sanitation does not require private facility
                )
df["ImprvSan"] = np.nan
df["ImprvSan"][(~df.ToiletFacility.isna())] = 0
df["ImprvSan"][imprv_sn_mask] = 1


## Hygiene services

# Recode "basic" hygiene (adapted, excluding 'mobile' item)
basic_hg_mask = ((df["ObsHandWashWATER"]=='Water_OK') # water available at the moment of the survey
                 &(df["ObsHandWashSOAP"]=='SoapOrDeterg') # soap (or equivalent) available at the moment of the survey
                 &(df["ObsHandWashPLACE"].isin(['Obs_Fixed'])) # presence of fixed structure to wash hands
                )
df["BasicHyg"] = np.nan
df["BasicHyg"][(~df["ObsHandWashWATER"].isna())&(~df["ObsHandWashSOAP"].isna())&(~df["ObsHandWashPLACE"].isna())] = 0
df["BasicHyg"][basic_hg_mask] = 1


## Housing conditions

# Recode overcrowding (whether number of habitants per room > 3, following 'slum' definition in SDG 11)
hab_cnt_df = df[['KEY','PARENT_KEY']].groupby('PARENT_KEY').count().reset_index() # count habitants per household
hab_cnt_df = hab_cnt_df.rename(columns={"KEY": "Cnt_Hab"})
df = df.merge(hab_cnt_df,how='left',on='PARENT_KEY') # join variable with count of houshehold members
df['HHROOMS'][df['HHROOMS'].isin([111,999])] = np.nan # exclude invalid answers & outliers regarding rooms
df['HabPerRoom'] = df['Cnt_Hab']/df['HHROOMS']
hiden_mask = (df["HabPerRoom"]>3) # high density if number of household members per room > 3
df["HiDensity"] = np.nan
df["HiDensity"][~df["HabPerRoom"].isna()] = 0
df["HiDensity"][hiden_mask] = 1


## External contamination pathways

# Recode frequency of consumption of street food
st_food_mask = (df["StreetFood"].isin(['2_4','5_Or_More'])) # twice or more per week
df["StFood"] = np.nan
df["StFood"][(~df["StreetFood"].isna())] = 0
df["StFood"][st_food_mask] = 1


## Potential confounders

# Wealth based on households assets
df["ItemsScore"]=(df["HHITEMS/Computer"]+df["HHITEMS/Electricity"]+
                  df["HHITEMS/Internet"]+df["HHITEMS/Microwave"]+
                  df["HHITEMS/Oven"]+df["HHITEMS/Radio"]+
                  df["HHITEMS/Refrigerator"]+df["HHITEMS/Smartphone"]+
                  df["HHITEMS/Television"])
df["ItemsScore"][(df["HHITEMS/Noitems"]==1)] = 0
df["ItemsScore"][(df["HHITEMS/NA"]==1)] = np.nan
wealth_mask = (df["ItemsScore"]>df["ItemsScore"].mean())# Number of items above the mean (of the surveyed households)
df["WealthyHH"] = np.nan
df["WealthyHH"][~df["ItemsScore"].isna()] = 0
df["WealthyHH"][wealth_mask] = 1

# Education level of heads of households
src = data_IND_tot[data_IND_tot.Relation_to_HH=='Head'][['PARENT_KEY',
                                                         'School_past']]# subset ed. level of HH
# Recode education level of heads of households
src['SecEduHH'] = np.nan
src['SecEduHH'][src['School_past'].isin(['No_Edu',
                                         'Early_CdE',
                                         'Primary'])] = 0 # up to primary education
src['SecEduHH'][src['School_past'].isin(['Secondary',
                                         'High_Ed'])] = 1 # secondary or higher education
src = src[~src.SecEduHH.isna()]
# In case a same household has 2 heads, merge lines
src = src[['SecEduHH','PARENT_KEY']].groupby(by="PARENT_KEY").max().reset_index()
# Attribute education level of heads of households
df = df.merge(src,on="PARENT_KEY",how='left')

In [6]:
## Subsets

# General population
dst_data = df.copy()
print("N (individuals) for general pop.:",dst_data.shape[0])

# Under 5, Abidjan
mask = (dst_data["Age"]<5)
dst_data_U5 = dst_data[mask]
print("N (individuals) for under 5:",dst_data_U5.shape[0])

N (individuals) for general pop.: 3786
N (individuals) for under 5: 491


# 3. Odds ratios from multiple logistic regressions

## 3.1. Exploratory analysis to identify candidate explanatory variables (to be included in logistic model)

### 3.1.1. Bivariate (unadjusted) odds ratio analysis to identify candidate explanatory variables for diarrhoea

In [7]:
## Set list of potentially relevant variables

# Exploratory list
explorat_lst = [# Exposures of interest (water sources)
                'PipedWt',
                'PipNbWt',
                'PbTapWt',
                'CmSrcWt',
                'StVenWt',
                # WASH - general
                'ImprvSan', # access to improved sanitation
                'BasicHyg', # presence of hand-washing materials AND fixed structure to wash hands (observed)
                'WtSrcNear', # whether round-trip to the water source takes 30 min or less
                'WtStored', # whether water is stored in a recipient (only when obtained from a source out of premises)
                'WtStrgOk', # whether water storage recipient is adequate (closed)
                'WtTreatment', # whether water is treated before drinking (bleach / chlorine / boil)
                'AvailblWtM', # water available during the month preceeding the survey (self-reported)
                # Housing conditions
                'HiDensity', # whether household is overcrowded
                # External factors
                'StFood', # frequent consumption of street foods (twice or more per week)
                # Potential confounders
                'WealthyHH', # whether household has more assets than the average
                'SecEduHH', # whether head of household has secondary education
               ]

# Subsets list 1: general population
subsets_gen = [dst_data]
subsets_gen_str = ['general population']

# Subsets list 2: households with children under five years old
subsets_cU5 = [dst_data_U5]
subsets_cU5_str = ['children under five']

In [8]:
# Exclude variables with only '0's or '1's
for data in [dst_data,dst_data_U5]:
    excl_list = []
    for var in explorat_lst:
        if data[var].value_counts().shape[0]==1:
            excl_list = excl_list+[var]
if (len(excl_list)>0):
    print('the following variables will be excluded ( n = ',len(excl_list),' ):',excl_list)
    explorat_lst = list(set(explorat_lst)-set(excl_list))
else:
    print('all good')

all good


In [9]:
## Risk of diarrhoea, general population

# Dummy dataset (general population)
df = dst_data.copy()

# Calculate unadjusted odds ratios
outcome = "Case"
df_oddsr_gen = pd.DataFrame()
for idx, subset in enumerate(subsets_gen):
    print("------------------ ",subsets_gen_str[idx]," ------------------")
    for exposure in explorat_lst:   
        # Data and logistic model
        data_reg = df[[outcome]+[exposure]].copy()
        data_reg = data_reg.dropna()
        logit = sm.Logit(data_reg[outcome], sm.add_constant(data_reg[exposure]))
        result = logit.fit()

        # Print the source type
        print('==============================================================================')
        print('                              Variable:',exposure)

        # Print the summary of the model
        print(result.summary())

        # Calculate the odds ratios and confidence intervals
        params = result.params
        conf = result.conf_int()
        signif = result.pvalues
        modelpval = result.llr_pvalue
        odds_ratios = np.exp(params)
        conf_lower = np.exp(conf[0])
        conf_upper = np.exp(conf[1])

        # Print the odds ratios and confidence intervals
        or_strat = pd.DataFrame({'Subset': subsets_gen_str[idx],
                                 'OR': odds_ratios,
                                 'signif': signif,
                                 'Lower CI': conf_lower,
                                 'Upper CI': conf_upper}).reset_index()
        or_strat['Significance'] = ''
        if modelpval < 0.05:
            or_strat['Significance'][or_strat['signif']>=0.1] = 'Not significant'
            or_strat['Significance'][or_strat['signif']<0.1] = '*'
            or_strat['Significance'][or_strat['signif']<0.05] = '**'
            or_strat['Significance'][or_strat['signif']<0.01] = '***'
            or_strat['Significance'][or_strat['signif']<0.001] = '****'
        else:
            or_strat['Significance'] = 'Not significant (LLR p-value > 5%)'
        or_strat = or_strat.drop(['signif'],axis=1)
        or_strat.columns = ['Exposure','Subset','Unadjusted OR','Lower CI (95%)','Upper CI (95%)','Significance']
        or_strat = or_strat.iloc[1:]
        df_oddsr_gen = pd.concat([df_oddsr_gen,
                                  or_strat[or_strat['Exposure']==exposure]],
                                 ignore_index=True)

df_oddsr_gen

------------------  general population  ------------------
Optimization terminated successfully.
         Current function value: 0.367092
         Iterations 6
                              Variable: PipedWt
                           Logit Regression Results                           
Dep. Variable:                   Case   No. Observations:                 3764
Model:                          Logit   Df Residuals:                     3762
Method:                           MLE   Df Model:                            1
Date:                Thu, 02 Nov 2023   Pseudo R-squ.:               6.816e-06
Time:                        20:06:20   Log-Likelihood:                -1381.7
converged:                       True   LL-Null:                       -1381.7
Covariance Type:            nonrobust   LLR p-value:                    0.8908
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
c

                              Variable: AvailblWtM
                           Logit Regression Results                           
Dep. Variable:                   Case   No. Observations:                 3761
Model:                          Logit   Df Residuals:                     3759
Method:                           MLE   Df Model:                            1
Date:                Thu, 02 Nov 2023   Pseudo R-squ.:                 0.01166
Time:                        20:06:20   Log-Likelihood:                -1365.3
converged:                       True   LL-Null:                       -1381.4
Covariance Type:            nonrobust   LLR p-value:                 1.383e-08
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.8004      0.058    -30.980      0.000      -1.914      -1.687
AvailblWtM    -0.6342      0.116     -5.450      0.000      -0.862      -0.406
O

Unnamed: 0,Exposure,Subset,Unadjusted OR,Lower CI (95%),Upper CI (95%),Significance
0,PipedWt,general population,0.975,0.678512,1.401044,Not significant (LLR p-value > 5%)
1,PipNbWt,general population,0.972777,0.705282,1.341725,Not significant (LLR p-value > 5%)
2,PbTapWt,general population,0.910301,0.725459,1.142239,Not significant (LLR p-value > 5%)
3,CmSrcWt,general population,1.410475,0.909054,2.188473,Not significant (LLR p-value > 5%)
4,StVenWt,general population,1.383276,0.816281,2.344113,Not significant (LLR p-value > 5%)
5,ImprvSan,general population,0.956622,0.777332,1.177265,Not significant (LLR p-value > 5%)
6,BasicHyg,general population,0.909103,0.584218,1.414655,Not significant (LLR p-value > 5%)
7,WtSrcNear,general population,0.203866,0.033973,1.22337,Not significant (LLR p-value > 5%)
8,WtStored,general population,1.024967,0.713285,1.472843,Not significant (LLR p-value > 5%)
9,WtStrgOk,general population,0.901397,0.72477,1.121068,Not significant (LLR p-value > 5%)


In [10]:
## Risk of diarrhoea, under fives subsets

# Dummy dataset (general population)
df = dst_data_U5.copy()

# Calculate unadjusted odds ratios
outcome = "Case"
df_oddsr_cU5 = pd.DataFrame()
for idx, subset in enumerate(subsets_cU5):
    print("------------------ ",subsets_cU5_str[idx]," ------------------")
    for exposure in explorat_lst:   
        # Data and logistic model
        data_reg = df[[outcome]+[exposure]].copy()
        data_reg = data_reg.dropna()
        logit = sm.Logit(data_reg[outcome], sm.add_constant(data_reg[exposure]))
        result = logit.fit()

        # Print the source type
        print('==============================================================================')
        print('                              Variable:',exposure)

        # Print the summary of the model
        print(result.summary())

        # Calculate the odds ratios and confidence intervals
        params = result.params
        conf = result.conf_int()
        signif = result.pvalues
        modelpval = result.llr_pvalue
        odds_ratios = np.exp(params)
        conf_lower = np.exp(conf[0])
        conf_upper = np.exp(conf[1])

        # Print the odds ratios and confidence intervals
        or_strat = pd.DataFrame({'Subset': subsets_cU5_str[idx],
                                 'OR': odds_ratios,
                                 'signif': signif,
                                 'Lower CI': conf_lower,
                                 'Upper CI': conf_upper}).reset_index()
        or_strat['Significance'] = ''
        if modelpval < 0.05:
            or_strat['Significance'][or_strat['signif']>=0.1] = 'Not significant'
            or_strat['Significance'][or_strat['signif']<0.1] = '*'
            or_strat['Significance'][or_strat['signif']<0.05] = '**'
            or_strat['Significance'][or_strat['signif']<0.01] = '***'
            or_strat['Significance'][or_strat['signif']<0.001] = '****'
        else:
            or_strat['Significance'] = 'Not significant (LLR p-value > 5%)'
        or_strat = or_strat.drop(['signif'],axis=1)
        or_strat.columns = ['Exposure','Subset','Unadjusted OR','Lower CI (95%)','Upper CI (95%)','Significance']
        or_strat = or_strat.iloc[1:]
        df_oddsr_cU5 = pd.concat([df_oddsr_cU5,
                                  or_strat[or_strat['Exposure']==exposure]],
                                 ignore_index=True)

df_oddsr_cU5

------------------  children under five  ------------------
Optimization terminated successfully.
         Current function value: 0.571618
         Iterations 5
                              Variable: PipedWt
                           Logit Regression Results                           
Dep. Variable:                   Case   No. Observations:                  491
Model:                          Logit   Df Residuals:                      489
Method:                           MLE   Df Model:                            1
Date:                Thu, 02 Nov 2023   Pseudo R-squ.:               4.971e-05
Time:                        20:06:27   Log-Likelihood:                -280.66
converged:                       True   LL-Null:                       -280.68
Covariance Type:            nonrobust   LLR p-value:                    0.8673
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------


Optimization terminated successfully.
         Current function value: 0.563446
         Iterations 5
                              Variable: WealthyHH
                           Logit Regression Results                           
Dep. Variable:                   Case   No. Observations:                  489
Model:                          Logit   Df Residuals:                      487
Method:                           MLE   Df Model:                            1
Date:                Thu, 02 Nov 2023   Pseudo R-squ.:                 0.01255
Time:                        20:06:27   Log-Likelihood:                -275.53
converged:                       True   LL-Null:                       -279.03
Covariance Type:            nonrobust   LLR p-value:                  0.008146
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.7807      0.143     -5.448      0.000    

Unnamed: 0,Exposure,Subset,Unadjusted OR,Lower CI (95%),Upper CI (95%),Significance
0,PipedWt,children under five,1.066793,0.5011954,2.270667,Not significant (LLR p-value > 5%)
1,PipNbWt,children under five,2.078846,1.175136,3.677532,**
2,PbTapWt,children under five,0.609929,0.3905037,0.9526503,**
3,CmSrcWt,children under five,1.059211,0.4344806,2.582225,Not significant (LLR p-value > 5%)
4,StVenWt,children under five,0.562903,0.1602438,1.977363,Not significant (LLR p-value > 5%)
5,ImprvSan,children under five,1.097659,0.7156662,1.683545,Not significant (LLR p-value > 5%)
6,BasicHyg,children under five,1.920139,0.6858211,5.375941,Not significant (LLR p-value > 5%)
7,WtSrcNear,children under five,5e-06,3.590127e-223,7.877358e+211,Not significant (LLR p-value > 5%)
8,WtStored,children under five,0.937389,0.4403992,1.99523,Not significant (LLR p-value > 5%)
9,WtStrgOk,children under five,1.418761,0.8748304,2.300885,Not significant (LLR p-value > 5%)


In [11]:
## List of significant covariates

# General population
list_gen = df_oddsr_gen['Exposure'][~df_oddsr_gen['Significance'].isin(['Not significant (LLR p-value > 5%)',
                                                                        'Not significant'])].to_list()

# Under fives subset
list_cu5 = df_oddsr_cU5['Exposure'][~df_oddsr_cU5['Significance'].isin(['Not significant (LLR p-value > 5%)',
                                                                        'Not significant'])].to_list()

# Merge lists
merged_lst = list(set(list_gen+list_cu5))

# Dataframe containing only selected covariates, by age group
df_raw_or = pd.concat([df_oddsr_gen[df_oddsr_gen['Exposure'].isin(merged_lst)],
                       df_oddsr_cU5[df_oddsr_cU5['Exposure'].isin(merged_lst)]],
                      ignore_index=True)

# Check results
print("Selected covariates (n=",len(merged_lst),"):",merged_lst)
df_raw_or

Selected covariates (n= 8 ): ['StFood', 'AvailblWtM', 'WealthyHH', 'WtTreatment', 'SecEduHH', 'PbTapWt', 'PipNbWt', 'HiDensity']


Unnamed: 0,Exposure,Subset,Unadjusted OR,Lower CI (95%),Upper CI (95%),Significance
0,PipNbWt,general population,0.972777,0.705282,1.341725,Not significant (LLR p-value > 5%)
1,PbTapWt,general population,0.910301,0.725459,1.142239,Not significant (LLR p-value > 5%)
2,WtTreatment,general population,0.808209,0.65848,0.991985,**
3,AvailblWtM,general population,0.530371,0.422206,0.666246,****
4,HiDensity,general population,1.13882,0.932947,1.390124,Not significant (LLR p-value > 5%)
5,StFood,general population,1.618643,1.276188,2.052993,****
6,WealthyHH,general population,0.833447,0.684486,1.014826,Not significant (LLR p-value > 5%)
7,SecEduHH,general population,0.836364,0.677306,1.032775,Not significant (LLR p-value > 5%)
8,PipNbWt,children under five,2.078846,1.175136,3.677532,**
9,PbTapWt,children under five,0.609929,0.390504,0.95265,**


### 3.1.2. Selection of control variables: test for multicollinearity with Variance Inflation Factor
- Criterion: variables with a VIF > 5 shall be discarded

In [12]:
# Set VIF threshold
vif_t = 5

# Set model's control variables (list of relevant covariates - list of exposures of interest)
exposure_list = ['BasicWt','PipedWt','PipNbWt','PbTapWt','CmSrcWt','StVenWt']
control_list = list(sorted(set(merged_lst)-set(exposure_list)))

# Dummy dataframes: keep only observations with valid answers for all selected variables
dataset = dst_data[control_list].copy().dropna()

# Create table for Variance Inflation Factor (VIF)
vif_scores = pd.DataFrame() 
vif_scores["Attribute"] = control_list

# Calculate VIF for each feature
vif_var = "VIF Scores"
vif_scores[vif_var] = [variance_inflation_factor(dataset[control_list].values, i) for i in range(len(dataset[control_list].columns))] 

# Assess variables' VIF
if vif_scores[vif_var].max() < vif_t:
    print("All good: no significant multicollinearity")
    # View results
    display(vif_scores)
else:
    drop_v_list = vif_scores["Attribute"][vif_scores[vif_var]>vif_t].to_list()
    print(">> WARNING: multicollinearity detected for:",
          drop_v_list)
    for var in drop_v_list:
        print(str(round(dataset[var].mean(),4)*100)+
              "% of individuals in selected households had a positive outcome for",
              '"',var,'"')
    # View results
    display(vif_scores)
    # Set model's variables
    print(">> REMOVING variables causing multicollinearity:", drop_v_list)
    control_list = list(sorted(set(control_list)-set(drop_v_list)))
    print(">> NEW LIST:", control_list)
    # Dummy dataframes: keep only observations with valid answers for all selected variables
    dataset = dst_data[control_list].copy().dropna()
    # Create table for Variance Inflation Factor (VIF)
    vif_scores = pd.DataFrame() 
    vif_scores["Attribute"] = control_list
    # Calculate VIF for each feature
    vif_var = "VIF Scores"
    vif_scores[vif_var] = [variance_inflation_factor(dataset[control_list].values, i) for i in range(len(dataset[control_list].columns))] 
    # View results
    display(vif_scores)
    # Assess variables' VIF
    if vif_scores[vif_var].max() < vif_t:
        print("All good: no significant multicollinearity")
    else:
        drop_v_list = vif_scores["Attribute"][vif_scores[vif_var]>vif_t].to_list()
        print(">> WARNING: multicollinearity still detected for:",
              drop_v_list)
        for var in drop_v_list:
            print(str(round(dataset[var].mean(),4)*100)+
                  "% of individuals in selected households had a positive outcome for",
                  '"',var,'"')

All good: no significant multicollinearity


Unnamed: 0,Attribute,VIF Scores
0,AvailblWtM,1.369212
1,HiDensity,1.926206
2,SecEduHH,1.735976
3,StFood,2.166987
4,WealthyHH,1.99757
5,WtTreatment,1.536555


## 3.2. Adjusted odds of diarrhoea in the general population by water source (adjusted by sanitaton, hygiene and potentially significant variables identified in step 3.1)

In [13]:
# Set VIF threshold
vif_t = 5

for var in exposure_list:
    print("------------------------ ",var," ------------------------")
    # Dummy dataframes: keep only observations with valid answers for all selected variables
    dataset = dst_data[[outcome]+control_list+[var]].copy().dropna()

    # Create table for Variance Inflation Factor (VIF)
    vif_scores = pd.DataFrame() 
    vif_scores["Attribute"] = [outcome]+control_list+[var]

    # Calculate VIF for each feature
    vif_var = "VIF Scores"
    vif_scores[vif_var] = [variance_inflation_factor(dataset[[outcome]+control_list+[var]].values, i) for i in range(len(dataset[[outcome]+control_list+[var]].columns))] 

    # Assess variables' VIF
    if vif_scores[vif_var].max() < vif_t:
        print("All good: no significant multicollinearity")
    else:
        drop_v_list = vif_scores["Attribute"][vif_scores[vif_var]>vif_t].to_list()
        print(">> WARNING: multicollinearity detected for:",
              drop_v_list)
        for var in drop_v_list:
            print(str(round(dataset[var].mean(),4)*100)+
                  "% of individuals in selected households had a positive outcome for",
                  '"',var,'"')
        print(">> REMOVING variables causing multicollinearity:", drop_v_list)
        exposure_list = list(sorted(set(exposure_list)-set(drop_v_list)))
        print(">> NEW LIST:", exposure_list)

    # View results
    display(vif_scores)

------------------------  BasicWt  ------------------------
99.42999999999999% of individuals in selected households had a positive outcome for " BasicWt "
>> REMOVING variables causing multicollinearity: ['BasicWt']
>> NEW LIST: ['CmSrcWt', 'PbTapWt', 'PipNbWt', 'PipedWt', 'StVenWt']


Unnamed: 0,Attribute,VIF Scores
0,Case,1.152501
1,AvailblWtM,1.525697
2,HiDensity,2.263004
3,SecEduHH,1.965956
4,StFood,3.366364
5,WealthyHH,2.233615
6,WtTreatment,1.735752
7,BasicWt,8.286728


------------------------  PipedWt  ------------------------
All good: no significant multicollinearity


Unnamed: 0,Attribute,VIF Scores
0,Case,1.133671
1,AvailblWtM,1.372492
2,HiDensity,1.939908
3,SecEduHH,1.756742
4,StFood,2.230364
5,WealthyHH,2.019632
6,WtTreatment,1.544176
7,PipedWt,1.11082


------------------------  PipNbWt  ------------------------
All good: no significant multicollinearity


Unnamed: 0,Attribute,VIF Scores
0,Case,1.133542
1,AvailblWtM,1.373574
2,HiDensity,1.936621
3,SecEduHH,1.765736
4,StFood,2.272433
5,WealthyHH,2.002377
6,WtTreatment,1.55299
7,PipNbWt,1.118506


------------------------  PbTapWt  ------------------------
All good: no significant multicollinearity


Unnamed: 0,Attribute,VIF Scores
0,Case,1.136893
1,AvailblWtM,1.448814
2,HiDensity,2.084533
3,SecEduHH,1.756747
4,StFood,2.593226
5,WealthyHH,2.079091
6,WtTreatment,1.579052
7,PbTapWt,3.184695


------------------------  CmSrcWt  ------------------------
All good: no significant multicollinearity


Unnamed: 0,Attribute,VIF Scores
0,Case,1.134317
1,AvailblWtM,1.372978
2,HiDensity,1.936029
3,SecEduHH,1.744315
4,StFood,2.234801
5,WealthyHH,2.003066
6,WtTreatment,1.539303
7,CmSrcWt,1.031384


------------------------  StVenWt  ------------------------
All good: no significant multicollinearity


Unnamed: 0,Attribute,VIF Scores
0,Case,1.133571
1,AvailblWtM,1.382115
2,HiDensity,1.936042
3,SecEduHH,1.740134
4,StFood,2.240878
5,WealthyHH,2.001776
6,WtTreatment,1.539389
7,StVenWt,1.02297


In [14]:
# General population dataset
df = dst_data.copy()

# Create OR dataframe (initially empty)
or_wtsource_diarr_gen = pd.DataFrame()

# Discard exposure variable if all observations are exposed, or non-exposed (OR not plausible)
drop_list = []
print('==============================================================================')
print('                         CHECKING EXPOSURE VARIABLES:')
print('')
for var in exposure_list:
    if len(df[var].value_counts().values)==1:
        drop_list = drop_list+[var]
        print('>> WARNING : dropped',var,'( only 0s or 1s )')
if len(drop_list)==0:
    print('>> All good!')
elif len(drop_list)>0:
    exposure_list = sorted(list(set(exposure_list)-set(drop_list))) # update exposure variables list if needed
print('==============================================================================')

# Run logistic regressions for each source individually
for exposure in exposure_list:
    
    # Set exposure list
    if exposure in control_list:
        var_list = list(set(control_list)-set([exposure]))+[exposure]
    else:
        var_list = control_list+[exposure]
    
    # Data and logistic model
    data_reg = df[[outcome]+var_list].copy()
    data_reg = data_reg.dropna()
    logit = sm.Logit(data_reg[outcome], sm.add_constant(data_reg[var_list]))
    result = logit.fit()
    
    # Print the source type
    print('==============================================================================')
    print('                              Variable:',exposure)
    
    # Print the summary of the model
    print(result.summary())
    
    # Calculate the odds ratios and confidence intervals
    params = result.params
    conf = result.conf_int()
    signif = result.pvalues
    modelpval = result.llr_pvalue
    odds_ratios = np.exp(params)
    conf_lower = np.exp(conf[0])
    conf_upper = np.exp(conf[1])

    # Print the odds ratios and confidence intervals
    or_strat = pd.DataFrame({'OR': odds_ratios,
                             'signif': signif,
                             'Lower CI': conf_lower,
                             'Upper CI': conf_upper}).reset_index()
    or_strat['Significance'] = ''
    if modelpval < 0.05:
        or_strat['Significance'][or_strat['signif']>=0.1] = 'Not significant'
        or_strat['Significance'][or_strat['signif']<0.1] = '*'
        or_strat['Significance'][or_strat['signif']<0.05] = '**'
        or_strat['Significance'][or_strat['signif']<0.01] = '***'
        or_strat['Significance'][or_strat['signif']<0.001] = '****'
    else:
        or_strat['Significance'] = 'Not significant (LLR p-value > 5%)'
    or_strat = or_strat.drop(['signif'],axis=1)
    or_strat.columns = ['Exposure','Adjusted OR','Lower CI (95%)','Upper CI (95%)','Significance']
    or_strat = or_strat.iloc[1:]
    or_wtsource_diarr_gen = pd.concat([or_wtsource_diarr_gen,
                                       or_strat[or_strat['Exposure']==exposure]],
                                      ignore_index=True)
    or_wtsource_diarr_gen = or_wtsource_diarr_gen.replace({'Exposure':{'PipedWt':'Water piped to premises',
                                                                       'PipNbWt':'Water obtained from neighbor (piped)',
                                                                       'PbTapWt':'Water obtained from public tap',
                                                                       'CmSrcWt':'Water obtained from a commercial source (vendor or boozer)',
                                                                       'StVenWt':'Water obtained from street vendor'
                                                                      }})

# View adjusted ORs for each source
or_wtsource_diarr_gen

                         CHECKING EXPOSURE VARIABLES:

>> All good!
Optimization terminated successfully.
         Current function value: 0.361897
         Iterations 6
                              Variable: CmSrcWt
                           Logit Regression Results                           
Dep. Variable:                   Case   No. Observations:                 3174
Model:                          Logit   Df Residuals:                     3166
Method:                           MLE   Df Model:                            7
Date:                Thu, 02 Nov 2023   Pseudo R-squ.:                 0.02059
Time:                        20:06:34   Log-Likelihood:                -1148.7
converged:                       True   LL-Null:                       -1172.8
Covariance Type:            nonrobust   LLR p-value:                 3.128e-08
                  coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------

Unnamed: 0,Exposure,Adjusted OR,Lower CI (95%),Upper CI (95%),Significance
0,Water obtained from a commercial source (vendo...,1.750858,0.963257,3.182434,*
1,Water obtained from public tap,0.898566,0.698682,1.155632,Not significant
2,Water obtained from neighbor (piped),0.901058,0.640284,1.268038,Not significant
3,Water piped to premises,1.174659,0.804715,1.714674,Not significant
4,Water obtained from street vendor,1.817386,0.732283,4.510403,Not significant


## 3.3. Adjusted odds of diarrhoea in children under five by water source (adjusted by sanitaton, hygiene and potentially significant variables identified in step 3.1)

In [15]:
# Set VIF threshold
vif_t = 5

for var in exposure_list:
    print("------------------------ ",var," ------------------------")
    # Dummy dataframes: keep only observations with valid answers for all selected variables
    dataset = dst_data_U5[[outcome]+control_list+[var]].copy().dropna()

    # Create table for Variance Inflation Factor (VIF)
    vif_scores = pd.DataFrame() 
    vif_scores["Attribute"] = [outcome]+control_list+[var]

    # Calculate VIF for each feature
    vif_var = "VIF Scores"
    vif_scores[vif_var] = [variance_inflation_factor(dataset[[outcome]+control_list+[var]].values, i) for i in range(len(dataset[[outcome]+control_list+[var]].columns))] 

    # Assess variables' VIF
    if vif_scores[vif_var].max() < vif_t:
        print("All good: no significant multicollinearity")
    else:
        drop_v_list = vif_scores["Attribute"][vif_scores[vif_var]>vif_t].to_list()
        print(">> WARNING: multicollinearity detected for:",
              drop_v_list)
        for var in drop_v_list:
            print(str(round(dataset[var].mean(),4)*100)+
                  "% of individuals in selected households had a positive outcome for",
                  '"',var,'"')
        print(">> REMOVING variables causing multicollinearity:", drop_v_list)
        exposure_list = list(sorted(set(exposure_list)-set(drop_v_list)))
        print(">> NEW LIST:", exposure_list)

    # View results
    display(vif_scores)

------------------------  CmSrcWt  ------------------------
All good: no significant multicollinearity


Unnamed: 0,Attribute,VIF Scores
0,Case,1.40563
1,AvailblWtM,1.407428
2,HiDensity,2.282686
3,SecEduHH,1.969436
4,StFood,2.384727
5,WealthyHH,2.122898
6,WtTreatment,1.735518
7,CmSrcWt,1.041748


------------------------  PbTapWt  ------------------------
All good: no significant multicollinearity


Unnamed: 0,Attribute,VIF Scores
0,Case,1.400111
1,AvailblWtM,1.478165
2,HiDensity,2.45509
3,SecEduHH,2.024722
4,StFood,2.661603
5,WealthyHH,2.185227
6,WtTreatment,1.772289
7,PbTapWt,3.228832


------------------------  PipNbWt  ------------------------
All good: no significant multicollinearity


Unnamed: 0,Attribute,VIF Scores
0,Case,1.427559
1,AvailblWtM,1.407192
2,HiDensity,2.284394
3,SecEduHH,1.993988
4,StFood,2.405475
5,WealthyHH,2.121612
6,WtTreatment,1.760809
7,PipNbWt,1.168432


------------------------  PipedWt  ------------------------
All good: no significant multicollinearity


Unnamed: 0,Attribute,VIF Scores
0,Case,1.403344
1,AvailblWtM,1.407658
2,HiDensity,2.289674
3,SecEduHH,1.986078
4,StFood,2.397877
5,WealthyHH,2.160593
6,WtTreatment,1.73498
7,PipedWt,1.133591


------------------------  StVenWt  ------------------------
All good: no significant multicollinearity


Unnamed: 0,Attribute,VIF Scores
0,Case,1.397844
1,AvailblWtM,1.417768
2,HiDensity,2.283525
3,SecEduHH,1.975128
4,StFood,2.390184
5,WealthyHH,2.109104
6,WtTreatment,1.734804
7,StVenWt,1.026415


In [16]:
# Children under five dataset
df = dst_data_U5.copy()

# Create OR dataframe (initially empty)
or_wtsource_diarr_u5 = pd.DataFrame()

# Discard exposure variable if all observations are exposed, or non-exposed (OR not plausible)
drop_list = []
print('==============================================================================')
print('                         CHECKING EXPOSURE VARIABLES:')
print('')
for var in exposure_list:
    if len(df[var].value_counts().values)==1:
        drop_list = drop_list+[var]
        print('>> WARNING : dropped',var,'( only 0s or 1s )')
if len(drop_list)==0:
    print('>> All good!')
elif len(drop_list)>0:
    exposure_list = sorted(list(set(exposure_list)-set(drop_list))) # update exposure variables list if needed
print('==============================================================================')

# Run logistic regressions for each source individually
for exposure in exposure_list:
    
    # Set exposure list
    if exposure in control_list:
        var_list = list(set(control_list)-set([exposure]))+[exposure]
    else:
        var_list = control_list+[exposure]
    
    # Data and logistic model
    data_reg = df[[outcome]+var_list].copy()
    data_reg = data_reg.dropna()
    logit = sm.Logit(data_reg[outcome], sm.add_constant(data_reg[var_list]))
    result = logit.fit()
    
    # Print the source type
    print('==============================================================================')
    print('                              Variable:',exposure)
    
    # Print the summary of the model
    print(result.summary())
    
    # Calculate the odds ratios and confidence intervals
    params = result.params
    conf = result.conf_int()
    signif = result.pvalues
    modelpval = result.llr_pvalue
    odds_ratios = np.exp(params)
    conf_lower = np.exp(conf[0])
    conf_upper = np.exp(conf[1])

    # Print the odds ratios and confidence intervals
    or_strat = pd.DataFrame({'OR': odds_ratios,
                             'signif': signif,
                             'Lower CI': conf_lower,
                             'Upper CI': conf_upper}).reset_index()
    or_strat['Significance'] = ''
    if modelpval < 0.05:
        or_strat['Significance'][or_strat['signif']>=0.1] = 'Not significant'
        or_strat['Significance'][or_strat['signif']<0.1] = '*'
        or_strat['Significance'][or_strat['signif']<0.05] = '**'
        or_strat['Significance'][or_strat['signif']<0.01] = '***'
        or_strat['Significance'][or_strat['signif']<0.001] = '****'
    else:
        or_strat['Significance'] = 'Not significant (LLR p-value > 5%)'
    or_strat = or_strat.drop(['signif'],axis=1)
    or_strat.columns = ['Exposure','Adjusted OR (under-5)','Lower CI (95%)','Upper CI (95%)','Significance']
    or_strat = or_strat.iloc[1:]
    or_wtsource_diarr_u5 = pd.concat([or_wtsource_diarr_u5,
                                      or_strat[or_strat['Exposure']==exposure]],
                                     ignore_index=True)
    or_wtsource_diarr_u5 = or_wtsource_diarr_u5.replace({'Exposure':{'PipedWt':'Water piped to premises',
                                                                     'PipNbWt':'Water obtained from neighbor (piped)',
                                                                     'PbTapWt':'Water obtained from public tap',
                                                                     'CmSrcWt':'Water obtained from a commercial source (vendor or boozer)',
                                                                     'StVenWt':'Water obtained from street vendor'
                                                                    }})

# View adjusted ORs for each source
or_wtsource_diarr_u5

                         CHECKING EXPOSURE VARIABLES:

>> All good!
Optimization terminated successfully.
         Current function value: 0.548482
         Iterations 6
                              Variable: CmSrcWt
                           Logit Regression Results                           
Dep. Variable:                   Case   No. Observations:                  410
Model:                          Logit   Df Residuals:                      402
Method:                           MLE   Df Model:                            7
Date:                Thu, 02 Nov 2023   Pseudo R-squ.:                 0.05286
Time:                        20:06:48   Log-Likelihood:                -224.88
converged:                       True   LL-Null:                       -237.43
Covariance Type:            nonrobust   LLR p-value:                 0.0007286
                  coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------

Unnamed: 0,Exposure,Adjusted OR (under-5),Lower CI (95%),Upper CI (95%),Significance
0,Water obtained from a commercial source (vendo...,2.49936,0.717631,8.704746,Not significant
1,Water obtained from public tap,0.474645,0.285321,0.789595,***
2,Water obtained from neighbor (piped),1.987153,1.056843,3.736391,**
3,Water piped to premises,1.644843,0.721363,3.75055,Not significant
4,Water obtained from street vendor,0.844222,0.082852,8.602165,Not significant


# 4. Supplementary analyses

## 4.1. Associations between lack of water and diarrhea

In [19]:
# Relevant variables list (outcome, exposure and control variables)
outcome = 'Case'
exposure_list = ['AvailblWtM']
control_list = list(sorted(set(merged_lst)-set(exposure_list+['PbTapWt','PipNbWt'])))

### 4.1.2. Water availability & diarrhea - General population

In [21]:
## UNADJUSTED ORs - all ages

# General population dataset
df = dst_data.copy()

# Create OR dataframe (initially empty)
or_wtavar_diarr_gen = pd.DataFrame()

# Discard exposure variable if all observations are exposed, or non-exposed (OR not plausible)
drop_list = []
print('==============================================================================')
print('                         CHECKING EXPOSURE VARIABLES:')
print('')
for var in exposure_list:
    if len(df[var].value_counts().values)==1:
        drop_list = drop_list+[var]
        print('>> WARNING : dropped',var,'( only 0s or 1s )')
if len(drop_list)==0:
    print('>> All good!')
elif len(drop_list)>0:
    exposure_list = sorted(list(set(exposure_list)-set(drop_list))) # update exposure variables list if needed
print('==============================================================================')

# Run logistic regressions for each source individually
for exposure in exposure_list:
    
    # Set exposure list
    if exposure in control_list:
        var_list = [exposure]
    else:
        var_list = [exposure]
    
    # Data and logistic model
    data_reg = df[[outcome]+var_list].copy()
    data_reg = data_reg.dropna()
    logit = sm.Logit(data_reg[outcome], sm.add_constant(data_reg[var_list]))
    result = logit.fit()
    
    # Print the source type
    print('==============================================================================')
    print('                              Variable:',exposure)
    
    # Print the summary of the model
    print(result.summary())
    
    # Calculate the odds ratios and confidence intervals
    params = result.params
    conf = result.conf_int()
    signif = result.pvalues
    modelpval = result.llr_pvalue
    odds_ratios = np.exp(params)
    conf_lower = np.exp(conf[0])
    conf_upper = np.exp(conf[1])

    # Print the odds ratios and confidence intervals
    or_strat = pd.DataFrame({'OR': odds_ratios,
                             'signif': signif,
                             'Lower CI': conf_lower,
                             'Upper CI': conf_upper}).reset_index()
    or_strat['Significance'] = ''
    if modelpval < 0.05:
        or_strat['Significance'][or_strat['signif']>=0.1] = 'Not significant'
        or_strat['Significance'][or_strat['signif']<0.1] = '*'
        or_strat['Significance'][or_strat['signif']<0.05] = '**'
        or_strat['Significance'][or_strat['signif']<0.01] = '***'
        or_strat['Significance'][or_strat['signif']<0.001] = '****'
    else:
        or_strat['Significance'] = 'Not significant (LLR p-value > 5%)'
    or_strat = or_strat.drop(['signif'],axis=1)
    or_strat.columns = ['Exposure','Unadjusted OR','Lower CI (95%)','Upper CI (95%)','Significance']
    or_strat = or_strat.iloc[1:]
    or_wtavar_diarr_gen = pd.concat([or_wtavar_diarr_gen,
                                     or_strat[or_strat['Exposure']==exposure]],
                                    ignore_index=True)
    or_wtavar_diarr_gen = or_wtavar_diarr_gen.replace({'Exposure':{'PipedWt':'Water piped to premises',
                                                                   'PbTapWt':'Water obtained from public tap',
                                                                   'StVenWt':'Water obtained from street vendor',
                                                                  }})

# View adjusted ORs for each source
or_wtavar_diarr_gen

                         CHECKING EXPOSURE VARIABLES:

>> All good!
Optimization terminated successfully.
         Current function value: 0.363003
         Iterations 6
                              Variable: AvailblWtM
                           Logit Regression Results                           
Dep. Variable:                   Case   No. Observations:                 3761
Model:                          Logit   Df Residuals:                     3759
Method:                           MLE   Df Model:                            1
Date:                Thu, 02 Nov 2023   Pseudo R-squ.:                 0.01166
Time:                        20:08:24   Log-Likelihood:                -1365.3
converged:                       True   LL-Null:                       -1381.4
Covariance Type:            nonrobust   LLR p-value:                 1.383e-08
                 coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------

Unnamed: 0,Exposure,Unadjusted OR,Lower CI (95%),Upper CI (95%),Significance
0,AvailblWtM,0.530371,0.422206,0.666246,****


In [22]:
# Set VIF threshold
vif_t = 5

# Dummy dataframes: keep only observations with valid answers for all selected variables
dataset = dst_data[[outcome]+control_list+exposure_list].copy().dropna()

# Create table for Variance Inflation Factor (VIF)
vif_scores = pd.DataFrame() 
vif_scores["Attribute"] = [outcome]+control_list+exposure_list

# Calculate VIF for each feature
vif_var = "VIF Scores"
vif_scores[vif_var] = [variance_inflation_factor(dataset[[outcome]+control_list+exposure_list].values, i) for i in range(len(dataset[[outcome]+control_list+exposure_list].columns))] 

# Assess variables' VIF
if vif_scores[vif_var].max() < vif_t:
    print("All good: no significant multicollinearity")
else:
    drop_v_list = vif_scores["Attribute"][vif_scores[vif_var]>vif_t].to_list()
    print(">> WARNING: multicollinearity detected for:",
          drop_v_list)    
    for var in drop_v_list:
        print(str(round(dataset[var].mean(),4)*100)+
              "% of individuals in selected households had a positive outcome for",
              '"',var,'"')

# View results
display(vif_scores)

All good: no significant multicollinearity


Unnamed: 0,Attribute,VIF Scores
0,Case,1.133163
1,HiDensity,1.935013
2,SecEduHH,1.739428
3,StFood,2.229135
4,WealthyHH,2.000087
5,WtTreatment,1.539238
6,AvailblWtM,1.372167


In [23]:
## ADJUSTED ORs - all ages

# General population dataset
df = dst_data.copy()

# Relevant variables list (outcome, exposure and control variables)
outcome = 'Case'
exposure_list = ['AvailblWtM']
control_list = list(sorted(set(control_list)-set(exposure_list)))

# Create OR dataframe (initially empty)
or_wtavar_diarr_gen = pd.DataFrame()

# Discard exposure variable if all observations are exposed, or non-exposed (OR not plausible)
drop_list = []
print('==============================================================================')
print('                         CHECKING EXPOSURE VARIABLES:')
print('')
for var in exposure_list:
    if len(df[var].value_counts().values)==1:
        drop_list = drop_list+[var]
        print('>> WARNING : dropped',var,'( only 0s or 1s )')
if len(drop_list)==0:
    print('>> All good!')
elif len(drop_list)>0:
    exposure_list = sorted(list(set(exposure_list)-set(drop_list))) # update exposure variables list if needed
print('==============================================================================')

# Run logistic regressions for each source individually
for exposure in exposure_list:
    
    # Set exposure list
    if exposure in control_list:
        var_list = list(set(control_list)-set([exposure]))+[exposure]
    else:
        var_list = control_list+[exposure]
    
    # Data and logistic model
    data_reg = df[[outcome]+var_list].copy()
    data_reg = data_reg.dropna()
    logit = sm.Logit(data_reg[outcome], sm.add_constant(data_reg[var_list]))
    result = logit.fit()
    
    # Print the source type
    print('==============================================================================')
    print('                              Variable:',exposure)
    
    # Print the summary of the model
    print(result.summary())
    
    # Calculate the odds ratios and confidence intervals
    params = result.params
    conf = result.conf_int()
    signif = result.pvalues
    modelpval = result.llr_pvalue
    odds_ratios = np.exp(params)
    conf_lower = np.exp(conf[0])
    conf_upper = np.exp(conf[1])

    # Print the odds ratios and confidence intervals
    or_strat = pd.DataFrame({'OR': odds_ratios,
                             'signif': signif,
                             'Lower CI': conf_lower,
                             'Upper CI': conf_upper}).reset_index()
    or_strat['Significance'] = ''
    if modelpval < 0.05:
        or_strat['Significance'][or_strat['signif']>=0.1] = 'Not significant'
        or_strat['Significance'][or_strat['signif']<0.1] = '*'
        or_strat['Significance'][or_strat['signif']<0.05] = '**'
        or_strat['Significance'][or_strat['signif']<0.01] = '***'
        or_strat['Significance'][or_strat['signif']<0.001] = '****'
    else:
        or_strat['Significance'] = 'Not significant (LLR p-value > 5%)'
    or_strat = or_strat.drop(['signif'],axis=1)
    or_strat.columns = ['Exposure','Adjusted OR','Lower CI (95%)','Upper CI (95%)','Significance']
    or_strat = or_strat.iloc[1:]
    or_wtavar_diarr_gen = pd.concat([or_wtavar_diarr_gen,
                                     or_strat[or_strat['Exposure']==exposure]],
                                    ignore_index=True)
    or_wtavar_diarr_gen = or_wtavar_diarr_gen.replace({'Exposure':{'PipedWt':'Water piped to premises',
                                                                   'PbTapWt':'Water obtained from public tap',
                                                                   'StVenWt':'Water obtained from street vendor',
                                                                  }})

# View adjusted ORs for each source
or_wtavar_diarr_gen

                         CHECKING EXPOSURE VARIABLES:

>> All good!
Optimization terminated successfully.
         Current function value: 0.362210
         Iterations 6
                              Variable: AvailblWtM
                           Logit Regression Results                           
Dep. Variable:                   Case   No. Observations:                 3176
Model:                          Logit   Df Residuals:                     3169
Method:                           MLE   Df Model:                            6
Date:                Thu, 02 Nov 2023   Pseudo R-squ.:                 0.01934
Time:                        20:08:35   Log-Likelihood:                -1150.4
converged:                       True   LL-Null:                       -1173.1
Covariance Type:            nonrobust   LLR p-value:                 3.956e-08
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------

Unnamed: 0,Exposure,Adjusted OR,Lower CI (95%),Upper CI (95%),Significance
0,AvailblWtM,0.526836,0.407201,0.68162,****


### 4.1.2. Water availability & diarrhea - Under fives

In [24]:
## UNADJUSTED ORs - under fives

# General population dataset
df = dst_data_U5.copy()

# Relevant variables list (outcome, exposure and control variables)
outcome = 'Case'
exposure_list = ['AvailblWtM']

# Create OR dataframe (initially empty)
or_wtavar_diarr_gen = pd.DataFrame()

# Discard exposure variable if all observations are exposed, or non-exposed (OR not plausible)
drop_list = []
print('==============================================================================')
print('                         CHECKING EXPOSURE VARIABLES:')
print('')
for var in exposure_list:
    if len(df[var].value_counts().values)==1:
        drop_list = drop_list+[var]
        print('>> WARNING : dropped',var,'( only 0s or 1s )')
if len(drop_list)==0:
    print('>> All good!')
elif len(drop_list)>0:
    exposure_list = sorted(list(set(exposure_list)-set(drop_list))) # update exposure variables list if needed
print('==============================================================================')

# Run logistic regressions for each source individually
for exposure in exposure_list:
    
    # Set exposure list
    if exposure in control_list:
        var_list = [exposure]
    else:
        var_list = [exposure]
    
    # Data and logistic model
    data_reg = df[[outcome]+var_list].copy()
    data_reg = data_reg.dropna()
    logit = sm.Logit(data_reg[outcome], sm.add_constant(data_reg[var_list]))
    result = logit.fit()
    
    # Print the source type
    print('==============================================================================')
    print('                              Variable:',exposure)
    
    # Print the summary of the model
    print(result.summary())
    
    # Calculate the odds ratios and confidence intervals
    params = result.params
    conf = result.conf_int()
    signif = result.pvalues
    modelpval = result.llr_pvalue
    odds_ratios = np.exp(params)
    conf_lower = np.exp(conf[0])
    conf_upper = np.exp(conf[1])

    # Print the odds ratios and confidence intervals
    or_strat = pd.DataFrame({'OR': odds_ratios,
                             'signif': signif,
                             'Lower CI': conf_lower,
                             'Upper CI': conf_upper}).reset_index()
    or_strat['Significance'] = ''
    if modelpval < 0.05:
        or_strat['Significance'][or_strat['signif']>=0.1] = 'Not significant'
        or_strat['Significance'][or_strat['signif']<0.1] = '*'
        or_strat['Significance'][or_strat['signif']<0.05] = '**'
        or_strat['Significance'][or_strat['signif']<0.01] = '***'
        or_strat['Significance'][or_strat['signif']<0.001] = '****'
    else:
        or_strat['Significance'] = 'Not significant (LLR p-value > 5%)'
    or_strat = or_strat.drop(['signif'],axis=1)
    or_strat.columns = ['Exposure','Unadjusted OR (under-5)','Lower CI (95%)','Upper CI (95%)','Significance']
    or_strat = or_strat.iloc[1:]
    or_wtavar_diarr_gen = pd.concat([or_wtavar_diarr_gen,
                                     or_strat[or_strat['Exposure']==exposure]],
                                    ignore_index=True)
    or_wtavar_diarr_gen = or_wtavar_diarr_gen.replace({'Exposure':{'PipedWt':'Water piped to premises',
                                                           'PbTapWt':'Water obtained from public tap',
                                                           'StVenWt':'Water obtained from street vendor',
                                                          }})

# View adjusted ORs for each source
or_wtavar_diarr_gen

                         CHECKING EXPOSURE VARIABLES:

>> All good!
Optimization terminated successfully.
         Current function value: 0.564524
         Iterations 5
                              Variable: AvailblWtM
                           Logit Regression Results                           
Dep. Variable:                   Case   No. Observations:                  490
Model:                          Logit   Df Residuals:                      488
Method:                           MLE   Df Model:                            1
Date:                Thu, 02 Nov 2023   Pseudo R-squ.:                 0.01342
Time:                        20:09:50   Log-Likelihood:                -276.62
converged:                       True   LL-Null:                       -280.38
Covariance Type:            nonrobust   LLR p-value:                  0.006088
                 coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------

Unnamed: 0,Exposure,Unadjusted OR (under-5),Lower CI (95%),Upper CI (95%),Significance
0,AvailblWtM,0.546396,0.351244,0.849976,***


In [25]:
# Set VIF threshold
vif_t = 5

# Dummy dataframes: keep only observations with valid answers for all selected variables
dataset = dst_data_U5[[outcome]+control_list+exposure_list].copy().dropna()

# Create table for Variance Inflation Factor (VIF)
vif_scores = pd.DataFrame() 
vif_scores["Attribute"] = [outcome]+control_list+exposure_list

# Calculate VIF for each feature
vif_var = "VIF Scores"
vif_scores[vif_var] = [variance_inflation_factor(dataset[[outcome]+control_list+exposure_list].values, i) for i in range(len(dataset[[outcome]+control_list+exposure_list].columns))] 

# Assess variables' VIF
if vif_scores[vif_var].max() < vif_t:
    print("All good: no significant multicollinearity")
else:
    drop_v_list = vif_scores["Attribute"][vif_scores[vif_var]>vif_t].to_list()
    print(">> WARNING: multicollinearity detected for:",
          drop_v_list)    
    for var in drop_v_list:
        print(str(round(dataset[var].mean(),4)*100)+
              "% of individuals in selected households had a positive outcome for",
              '"',var,'"')

# View results
display(vif_scores)

All good: no significant multicollinearity


Unnamed: 0,Attribute,VIF Scores
0,Case,1.397732
1,HiDensity,2.282679
2,SecEduHH,1.969436
3,StFood,2.37691
4,WealthyHH,2.109041
5,WtTreatment,1.733651
6,AvailblWtM,1.406887


In [26]:
## ADJUSTED ORs - under fives

# Children under five dataset
df = dst_data_U5.copy()

# Create OR dataframe (initially empty)
or_wtavar_diarr_u5 = pd.DataFrame()

# Discard exposure variable if all observations are exposed, or non-exposed (OR not plausible)
drop_list = []
print('==============================================================================')
print('                         CHECKING EXPOSURE VARIABLES:')
print('')
for var in exposure_list:
    if len(df[var].value_counts().values)==1:
        drop_list = drop_list+[var]
        print('>> WARNING : dropped',var,'( only 0s or 1s )')
if len(drop_list)==0:
    print('>> All good!')
elif len(drop_list)>0:
    exposure_list = sorted(list(set(exposure_list)-set(drop_list))) # update exposure variables list if needed
print('==============================================================================')

# Run logistic regressions for each source individually
for exposure in exposure_list:
    
    # Set exposure list
    if exposure in control_list:
        var_list = list(set(control_list)-set([exposure]))+[exposure]
    else:
        var_list = control_list+[exposure]
    
    # Data and logistic model
    data_reg = df[[outcome]+var_list].copy()
    data_reg = data_reg.dropna()
    logit = sm.Logit(data_reg[outcome], sm.add_constant(data_reg[var_list]))
    result = logit.fit()
    
    # Print the source type
    print('==============================================================================')
    print('                              Variable:',exposure)
    
    # Print the summary of the model
    print(result.summary())
    
    # Calculate the odds ratios and confidence intervals
    params = result.params
    conf = result.conf_int()
    signif = result.pvalues
    modelpval = result.llr_pvalue
    odds_ratios = np.exp(params)
    conf_lower = np.exp(conf[0])
    conf_upper = np.exp(conf[1])

    # Print the odds ratios and confidence intervals
    or_strat = pd.DataFrame({'OR': odds_ratios,
                             'signif': signif,
                             'Lower CI': conf_lower,
                             'Upper CI': conf_upper}).reset_index()
    or_strat['Significance'] = ''
    if modelpval < 0.05:
        or_strat['Significance'][or_strat['signif']>=0.1] = 'Not significant'
        or_strat['Significance'][or_strat['signif']<0.1] = '*'
        or_strat['Significance'][or_strat['signif']<0.05] = '**'
        or_strat['Significance'][or_strat['signif']<0.01] = '***'
        or_strat['Significance'][or_strat['signif']<0.001] = '****'
    else:
        or_strat['Significance'] = 'Not significant (LLR p-value > 5%)'
    or_strat = or_strat.drop(['signif'],axis=1)
    or_strat.columns = ['Exposure','Adjusted OR (under-5)','Lower CI (95%)','Upper CI (95%)','Significance']
    or_strat = or_strat.iloc[1:]
    or_wtavar_diarr_u5 = pd.concat([or_wtavar_diarr_u5,
                                    or_strat[or_strat['Exposure']==exposure]],
                                   ignore_index=True)
    or_wtavar_diarr_u5 = or_wtavar_diarr_u5.replace({'Exposure':{'PipedWt':'Water piped to premises',
                                                           'PbTapWt':'Water obtained from public tap',
                                                           'StVenWt':'Water obtained from street vendor',
                                                          }})

# View adjusted ORs for each source
or_wtavar_diarr_u5

                         CHECKING EXPOSURE VARIABLES:

>> All good!
Optimization terminated successfully.
         Current function value: 0.550891
         Iterations 6
                              Variable: AvailblWtM
                           Logit Regression Results                           
Dep. Variable:                   Case   No. Observations:                  410
Model:                          Logit   Df Residuals:                      403
Method:                           MLE   Df Model:                            6
Date:                Thu, 02 Nov 2023   Pseudo R-squ.:                 0.04870
Time:                        20:10:12   Log-Likelihood:                -225.87
converged:                       True   LL-Null:                       -237.43
Covariance Type:            nonrobust   LLR p-value:                 0.0007561
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------

Unnamed: 0,Exposure,Adjusted OR (under-5),Lower CI (95%),Upper CI (95%),Significance
0,AvailblWtM,0.658523,0.402246,1.07808,*


## 4.2. Descriptive statistics

### 4.2.1. All households
Basic demographic statistics for the population living in participating households

In [27]:
## All valid household surveys

# Dummy dataframe
df = dst_data.copy()

In [28]:
## Number of individuals

# Both sites
tot_ind = df.shape[0]
tot_hh = len(df.PARENT_KEY.unique())

# Mathare
tot_ind_mat = df[df['Site']=='Mathare'].shape[0]
tot_hh_mat = len(df[df['Site']=='Mathare'].PARENT_KEY.unique())

# Mukuru
tot_ind_muk = df[df['Site']=='Mukuru'].shape[0]
tot_hh_muk = len(df[df['Site']=='Mukuru'].PARENT_KEY.unique())

# Print stats
print('Total individuals:',tot_ind)
print('Total households:',tot_hh)
print('----------------------------------------')
print('Total individuals in Mathare:',tot_ind_mat)
print('Total households in Mathare:',tot_hh_mat)
print('----------------------------------------')
print('Total individuals in Mukuru:',tot_ind_muk)
print('Total households in Mukuru:',tot_hh_muk)

Total individuals: 3786
Total households: 1147
----------------------------------------
Total individuals in Mathare: 1935
Total households in Mathare: 576
----------------------------------------
Total individuals in Mukuru: 1851
Total households in Mukuru: 571


In [29]:
## Number of individuals, by gender

# Both sites
tot_ind_m = df[df['Sex']=='M'].shape[0]
tot_ind_f = df[df['Sex']=='F'].shape[0]

# Mathare
tot_ind_mat_m = df[(df['Sex']=='M')&(df['Site']=='Mathare')].shape[0]
tot_ind_mat_f = df[(df['Sex']=='F')&(df['Site']=='Mathare')].shape[0]

# Mukuru
tot_ind_muk_m = df[(df['Sex']=='M')&(df['Site']=='Mukuru')].shape[0]
tot_ind_muk_f = df[(df['Sex']=='F')&(df['Site']=='Mukuru')].shape[0]

# Print stats
print('Total male individuals:',tot_ind_m,
      '(',round((tot_ind_m/tot_ind)*100,2),'% )')
print('Total female individuals:',tot_ind_f,
      '(',round((tot_ind_f/tot_ind)*100,2),'% )')
print('----------------------------------------')
print('Total male individuals in Mathare:',tot_ind_mat_m,
      '(',round((tot_ind_mat_m/tot_ind_mat)*100,2),'% )')
print('Total female individuals in Mathare:',tot_ind_mat_f,
      '(',round((tot_ind_mat_f/tot_ind_mat)*100,2),'% )')
print('----------------------------------------')
print('Total male individuals in Mukuru:',tot_ind_muk_m,
      '(',round((tot_ind_muk_m/tot_ind_muk)*100,2),'% )')
print('Total female individuals in Mukuru:',tot_ind_muk_f,
      '(',round((tot_ind_muk_f/tot_ind_muk)*100,2),'% )')

Total male individuals: 1775 ( 46.88 % )
Total female individuals: 2011 ( 53.12 % )
----------------------------------------
Total male individuals in Mathare: 854 ( 44.13 % )
Total female individuals in Mathare: 1081 ( 55.87 % )
----------------------------------------
Total male individuals in Mukuru: 921 ( 49.76 % )
Total female individuals in Mukuru: 930 ( 50.24 % )


In [30]:
## Number of individuals, by age

# Both sites
tot_ind_ag_u5 = df[df['Age']<5].shape[0]
tot_ind_ag_ad = df[(df['Age']>18)&(df['Age']<=90)].shape[0]

# Azito
tot_ind_mat_ag_u5 = df[(df['Age']<5)&(df['Site']=='Mathare')].shape[0]
tot_ind_mat_ag_ad = df[(df['Age']>18)&(df['Age']<=90)&(df['Site']=='Mathare')].shape[0]

# Mukuru
tot_ind_muk_ag_u5 = df[(df['Age']<5)&(df['Site']=='Mukuru')].shape[0]
tot_ind_muk_ag_ad = df[(df['Age']>18)&(df['Age']<=90)&(df['Site']=='Mukuru')].shape[0]

# Print stats
print('Total individuals under five:',tot_ind_ag_u5,
      '(',round((tot_ind_ag_u5/tot_ind)*100,2),'% )')
print('Total adult individuals:',tot_ind_ag_ad,
      '(',round((tot_ind_ag_ad/tot_ind)*100,2),'% )')
print('----------------------------------------')
print('Total individuals under five in Mathare:',tot_ind_mat_ag_u5,
      '(',round((tot_ind_mat_ag_u5/tot_ind_mat)*100,2),'% )')
print('Total adult individuals in Mathare:',tot_ind_mat_ag_ad,
      '(',round((tot_ind_mat_ag_u5/tot_ind_mat)*100,2),'% )')
print('----------------------------------------')
print('Total individuals under five in Mukuru:',tot_ind_muk_ag_u5,
      '(',round((tot_ind_muk_ag_u5/tot_ind_muk)*100,2),'% )')
print('Total adult individuals in Mukuru:',tot_ind_muk_ag_ad,
      '(',round((tot_ind_muk_ag_u5/tot_ind_muk)*100,2),'% )')


Total individuals under five: 491 ( 12.97 % )
Total adult individuals: 2159 ( 57.03 % )
----------------------------------------
Total individuals under five in Mathare: 226 ( 11.68 % )
Total adult individuals in Mathare: 1091 ( 11.68 % )
----------------------------------------
Total individuals under five in Mukuru: 265 ( 14.32 % )
Total adult individuals in Mukuru: 1068 ( 14.32 % )


### 4.2.2. Households included in the models
Basic demographic statistics for the population living in participating households with valid answers for all variables included in the multiple logistic regressions

#### Notes
- Of relevance, this reduction did not affect the overall proportion of males/females or that of children younger than five years old, as shown below.

In [31]:
## Households with valid answers for all 8 variables included in the regressions

# Dummy dataframe
df = dst_data.copy()

# Subset data
df = df[['PARENT_KEY','Site','Case']+control_list+exposure_list+['Sex','Age']]
df = df.dropna()

In [32]:
## Number of individuals

# Both sites
tot_ind = df.shape[0]
tot_hh = len(df.PARENT_KEY.unique())

# Mathare
tot_ind_mat = df[df['Site']=='Mathare'].shape[0]
tot_hh_mat = len(df[df['Site']=='Mathare'].PARENT_KEY.unique())

# Mukuru
tot_ind_muk = df[df['Site']=='Mukuru'].shape[0]
tot_hh_muk = len(df[df['Site']=='Mukuru'].PARENT_KEY.unique())

# Print stats
print('Total individuals (MLRs):',tot_ind)
print('Total households (MLRs):',tot_hh)
print('----------------------------------------')
print('Total individuals in Mathare (MLRs):',tot_ind_mat)
print('Total households in Mathare (MLRs):',tot_hh_mat)
print('----------------------------------------')
print('Total individuals in Mukuru (MLRs):',tot_ind_muk)
print('Total households in Mukuru (MLRs):',tot_hh_muk)

Total individuals (MLRs): 3176
Total households (MLRs): 981
----------------------------------------
Total individuals in Mathare (MLRs): 1649
Total households in Mathare (MLRs): 502
----------------------------------------
Total individuals in Mukuru (MLRs): 1527
Total households in Mukuru (MLRs): 479


In [33]:
## Number of individuals, by gender

# Both sites
tot_ind_m = df[df['Sex']=='M'].shape[0]
tot_ind_f = df[df['Sex']=='F'].shape[0]

# Mathare
tot_ind_mat_m = df[(df['Sex']=='M')&(df['Site']=='Mathare')].shape[0]
tot_ind_mat_f = df[(df['Sex']=='F')&(df['Site']=='Mathare')].shape[0]

# Mukuru
tot_ind_muk_m = df[(df['Sex']=='M')&(df['Site']=='Mukuru')].shape[0]
tot_ind_muk_f = df[(df['Sex']=='F')&(df['Site']=='Mukuru')].shape[0]

# Print stats
print('Total male individuals (MLRs):',tot_ind_m,
      '(',round((tot_ind_m/tot_ind)*100,2),'% )')
print('Total female individuals (MLRs):',tot_ind_f,
      '(',round((tot_ind_f/tot_ind)*100,2),'% )')
print('----------------------------------------')
print('Total male individuals in Mathare (MLRs):',tot_ind_mat_m,
      '(',round((tot_ind_mat_m/tot_ind_mat)*100,2),'% )')
print('Total female individuals in Mathare (MLRs):',tot_ind_mat_f,
      '(',round((tot_ind_mat_f/tot_ind_mat)*100,2),'% )')
print('----------------------------------------')
print('Total male individuals in Mukuru (MLRs):',tot_ind_muk_m,
      '(',round((tot_ind_muk_m/tot_ind_muk)*100,2),'% )')
print('Total female individuals in Mukuru (MLRs):',tot_ind_muk_f,
      '(',round((tot_ind_muk_f/tot_ind_muk)*100,2),'% )')

Total male individuals (MLRs): 1483 ( 46.69 % )
Total female individuals (MLRs): 1693 ( 53.31 % )
----------------------------------------
Total male individuals in Mathare (MLRs): 722 ( 43.78 % )
Total female individuals in Mathare (MLRs): 927 ( 56.22 % )
----------------------------------------
Total male individuals in Mukuru (MLRs): 761 ( 49.84 % )
Total female individuals in Mukuru (MLRs): 766 ( 50.16 % )


In [34]:
## Number of individuals, by age

# Both sites
tot_ind_ag_u5 = df[df['Age']<5].shape[0]
tot_ind_ag_ad = df[(df['Age']>18)&(df['Age']<=90)].shape[0]

# Mathare
tot_ind_mat_ag_u5 = df[(df['Age']<5)&(df['Site']=='Mathare')].shape[0]
tot_ind_mat_ag_ad = df[(df['Age']>18)&(df['Age']<=90)&(df['Site']=='Mathare')].shape[0]

# Mukuru
tot_ind_muk_ag_u5 = df[(df['Age']<5)&(df['Site']=='Mukuru')].shape[0]
tot_ind_muk_ag_ad = df[(df['Age']>18)&(df['Age']<=90)&(df['Site']=='Mukuru')].shape[0]

# Print stats
print('Total individuals under five (MLRs):',tot_ind_ag_u5,
      '(',round((tot_ind_ag_u5/tot_ind)*100,2),'% )')
print('Total adult individuals (MLRs):',tot_ind_ag_ad,
      '(',round((tot_ind_ag_ad/tot_ind)*100,2),'% )')
print('----------------------------------------')
print('Total individuals under five in Mathare (MLRs):',tot_ind_mat_ag_u5,
      '(',round((tot_ind_mat_ag_u5/tot_ind_mat)*100,2),'% )')
print('Total adult individuals in Mathare (MLRs):',tot_ind_mat_ag_ad,
      '(',round((tot_ind_mat_ag_u5/tot_ind_mat)*100,2),'% )')
print('----------------------------------------')
print('Total individuals under five in Mukuru (MLRs):',tot_ind_muk_ag_u5,
      '(',round((tot_ind_muk_ag_u5/tot_ind_muk)*100,2),'% )')
print('Total adult individuals in Mukuru (MLRs):',tot_ind_muk_ag_ad,
      '(',round((tot_ind_muk_ag_u5/tot_ind_muk)*100,2),'% )')


Total individuals under five (MLRs): 410 ( 12.91 % )
Total adult individuals (MLRs): 1817 ( 57.21 % )
----------------------------------------
Total individuals under five in Mathare (MLRs): 193 ( 11.7 % )
Total adult individuals in Mathare (MLRs): 924 ( 11.7 % )
----------------------------------------
Total individuals under five in Mukuru (MLRs): 217 ( 14.21 % )
Total adult individuals in Mukuru (MLRs): 893 ( 14.21 % )
