In [1]:
import os
from pathlib import Path
from os import path

import geopandas as gpd
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm

from statsmodels.stats.outliers_influence import variance_inflation_factor

# 1. Input data

In [2]:
cwd = os.getcwd()
root_dir = Path(cwd).parent

In [3]:
## Input data

# Individual data
data_IND_tot = pd.read_csv(path.join(root_dir,
                                    "data/raw/individual_data.csv")).drop(['Unnamed: 0'],axis=1)

# Household data
data_HH_tot = pd.read_csv(path.join(root_dir,
                                    "data/raw/household_data.csv")).drop(['Unnamed: 0'],axis=1)

# 2. Preprocess data

## 2.1. Set exposure variables and subsets

In [4]:
## Set exposure variables necessary for analysis: WASH facilities' characteristics + potential confounders

# Dummy dataframe
df = data_HH_tot.copy()

## Outcome of interest

# Perceived safety to go to WC
wc_unsf_mask = ((df["ToiletFacilitySAFE"]=="During_Day")|
                (df["ToiletFacilitySAFE"]=="Unsafe")) # if safe only during the day, or never, toilet is considered unsafe
df["WCunsafe"] = np.nan
df["WCunsafe"][~df["ToiletFacilitySAFE"].isna()] = 0
df["WCunsafe"][wc_unsf_mask] = 1


## WASH facilities

# "Improved" sanitation (according to WHO-UNICEF's JMP) VS "unimproved"
isan_mask_abi = ((((df.City=='Abidjan')&
                   (df.ToiletFacilityTYPE=='DryOrCompost')&
                   (df.ToiletFacilityTYPE_Dry.isin(['Dry_ImprSlab', # improved pit latrine
                                                    'Dry_VIP'])))| # VIP: ventilated, improved pit latrine
                  ((df.City=='Abidjan')&
                   (df.ToiletFacilityTYPE=='Flush')&
                   (df.ToiletFacilityTYPE_Flush.isin(['Flush_piped',
                                                      'Flush_septic_tank',
                                                      'Flush_coveredPit']))))&(df.ToiletROOF=='Y')) # many toilets in Abidjan did not have a roof
isan_mask_nai = ((((df.City=='Nairobi')&
                   (df.ToiletFacilityTYPE=='DryOrCompost')&
                   (df.ToiletFacilityTYPE_Dry.isin(['Dry_ImprSlab', # improved pit latrine
                                                    'Dry_VIP', # VIP: ventilated, improved pit latrine
                                                    'Dry_FreshLife'])))| # Sanergy's FreshLife toilet unit
                  ((df.City=='Nairobi')&
                   (df.ToiletFacilityTYPE=='Flush')&
                   (df.ToiletFacilityTYPE_Flush.isin(['Flush_piped',
                                                      'Flush_septic_tank',
                                                      'Flush_coveredPit'])))))
df["ImprvSan"] = np.nan
df["ImprvSan"][(~df.ToiletFacility.isna())&(~df.ToiletFacility.isin(['NoFacility_Nature']))] = 0
df["ImprvSan"][isan_mask_abi] = 1
df["ImprvSan"][isan_mask_nai] = 1

# Shared toilet
wc_share_mask = (df["ToiletFacilitySHARE"]=='Y') # toilet shared by more than one household
df["WCshared"] = np.nan
df["WCshared"][(~df["ToiletFacilitySHARE"].isna())] = 0
df["WCshared"][wc_share_mask] = 1

# Recode location of toilet (within premises X out of premises, excluding open defecation from analysis)
public_sel = ["Public","Neighb_YardPlot","Neighb_Dwelling"] # any WC located OUT OF PREMISES
wc_loc_mask = (df["ToiletFacility"].isin(public_sel))
df["WCoutprem"] = np.nan
df["WCoutprem"][(~df["ToiletFacility"].isna())&(df["ToiletFacility"]!="NoFacility_Nature")] = 0
df["WCoutprem"][wc_loc_mask] = 1


## Recode presence of children

# Input: dummy dataframe
src_data = data_IND_tot.copy()
# Source data: get only variables of interest
src_cU5 = pd.DataFrame([src_data["PARENT_KEY"],src_data["Age"]]).transpose()
src_cU5 = src_cU5.rename(columns={"PARENT_KEY": "KEY"})
# New column: indicate presence of at least one child under-5
df["Child_U5"] = 0
lst_key_U5 = list(src_cU5["KEY"][src_cU5["Age"]<5]) # list unique IDs of households with child under 5
df["Child_U5"][df["KEY"].isin(lst_key_U5)] = 1


## Potential confounders

# Education level of heads of households
src = data_IND_tot[data_IND_tot.Relation_to_HH=='Head'][['PARENT_KEY',
                                                         'School_past']]# subset ed. level of HH
# heads of households with at least secondary education
src['SecEduHH'] = np.nan
src['SecEduHH'][src['School_past'].isin(['No_Edu',
                                         'Early_CdE',
                                         'Primary',
                                         'Coranic'])] = 0 # up to primary education
src['SecEduHH'][src['School_past'].isin(['Secondary',
                                         'Secondary_1',
                                         'Secondary_2',
                                         'High_Ed'])] = 1 # secondary or higher education
src = src[~src.SecEduHH.isna()]
# In case a same household has 2 heads, merge lines
src = src[['SecEduHH','PARENT_KEY']].groupby(by="PARENT_KEY").max().reset_index()
src = src.rename(columns={'PARENT_KEY':'KEY'})
# Attribute education level of heads of households
df = df.merge(src,on="KEY",how='left')

# Recode Sex of head of household (female = 1)
df['Sex_HH_F'] = np.nan
df['Sex_HH_F'][df['Sex_HH']=='M'] = 0
df['Sex_HH_F'][df['Sex_HH']=='F'] = 1

In [5]:
## Subsets

# Abidjan
df_abi = df[df['City']=='Abidjan'].copy()
print("N for general pop. in Abidjan:",df_abi.shape[0])

# Nairobi
df_nai = df[df['City']=='Nairobi'].copy()
print("N for general pop. in Nairobi:",df_nai.shape[0])

N for general pop. in Abidjan: 567
N for general pop. in Nairobi: 1147


# 3. Odds ratios

## 3.1. Lack of safety

### 3.1.1. Bivariate odds ratio analysis to identify candidate explanatory variables for lack of safety to access toilet

In [6]:
## Set list of exposure variables

# List
exposure_lst = ['WCoutprem', # most used toilet is located in a 'public' place (out of premises)
                'WCshared', # WC is shared by more than 1 household
                'SecEduHH', # head of household attained secondary education
                'Sex_HH_F', # respondent was female
               ]

# Subsets list 1: general population
subsets_gen = [df_abi,df_nai]
subsets_gen_str = ['respondents in Abidjan','respondents in Nairobi']

In [7]:
## Perceived lack of safety to go to toilets

# Calculate odds ratios
outcome_var = "WCunsafe"
outcome_pos = 1
outcome_neg = 0
df_oddsr_gen = pd.DataFrame()
for idx, subset in enumerate(subsets_gen):
    print("------------------ ",subsets_gen_str[idx]," ------------------")
    for exposure in exposure_lst:
        # define groups
        print("Variable:",exposure)
        exposure_grp = subset[subset[exposure]==1]
        no_exposure_grp = subset[subset[exposure]==0]
        exposure_grp = exposure_grp[~exposure_grp[outcome_var].isna()]
        no_exposure_grp = no_exposure_grp[~no_exposure_grp[outcome_var].isna()]
        # set table for Fisher tests
        table = np.array([[exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0], exposure_grp[exposure_grp[outcome_var]==outcome_neg].shape[0]],
                          [no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0], no_exposure_grp[no_exposure_grp[outcome_var]==outcome_neg].shape[0]]])
        # calculate proportion of households with at least 1 case
        exposure_grp_prop = exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0]/exposure_grp.shape[0]
        no_exposure_grp_prop = no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0]/no_exposure_grp.shape[0]
        # calculate 95% CI - exposure group
        P_exp = exposure_grp_prop
        N_exp = exposure_grp.shape[0]
        CI_exp = 1.96*(np.sqrt((P_exp*(1-P_exp))/N_exp))
        # calculate 95% CI - no exposure group
        P_ne = no_exposure_grp_prop
        N_ne = no_exposure_grp.shape[0]
        CI_ne = 1.96*(np.sqrt((P_ne*(1-P_ne))/N_ne))
        # run Fisher tests for OR = 1
        oddsratio_eq1, pvalue_eq1 = stats.fisher_exact(table)
        # run Fisher tests for OR > 1
        oddsratio_greater1, pvalue_greater1 = stats.fisher_exact(table,alternative="greater")
        # run Fisher tests for OR < 1
        oddsratio_less1, pvalue_less1 = stats.fisher_exact(table,alternative="less")
        # add results to dataframe
        df_oddsr_gen = df_oddsr_gen.append([[exposure,
                                             subsets_gen_str[idx],
                                             exposure_grp_prop*100,
                                             '±'+str(round(CI_exp*100,2)),
                                             no_exposure_grp_prop*100,
                                             '±'+str(round(CI_ne*100,2)),
                                             oddsratio_eq1,
                                             pvalue_eq1,
                                             pvalue_greater1,
                                             pvalue_less1,
                                             table
                                            ]])

# Reset columns' names & index
df_oddsr_gen.columns = ['exposure variable','stratum',
                        '% outcomes, exposed','95% CI, exposed',
                        '% outcomes, non-exposed','95% CI, non-exposed',
                        'OR','p-value for OR=1',
                        'p_OR_hi_1','p_OR_lo_1',
                        'table'
                       ]
df_oddsr_gen = df_oddsr_gen.reset_index()

------------------  respondents in Abidjan  ------------------
Variable: WCoutprem
Variable: WCshared
Variable: SecEduHH
Variable: Sex_HH_F
------------------  respondents in Nairobi  ------------------
Variable: WCoutprem
Variable: WCshared
Variable: SecEduHH
Variable: Sex_HH_F


In [8]:
## List of significant covariates

# Significance threshold (maximum p-value)
st = 0.05

# General population
list_gen = list(set(df_oddsr_gen['exposure variable'][df_oddsr_gen['p-value for OR=1']<st].to_list()))

# Dataframe containing only selected covariates, by age group
df_raw_or = df_oddsr_gen[df_oddsr_gen['exposure variable'].isin(list_gen)].drop(['p_OR_hi_1', 'p_OR_lo_1'],axis=1)

# Check results
print("Selected covariates (n=",len(list_gen),"):",list_gen)
df_raw_or

Selected covariates (n= 4 ): ['WCoutprem', 'WCshared', 'SecEduHH', 'Sex_HH_F']


Unnamed: 0,index,exposure variable,stratum,"% outcomes, exposed","95% CI, exposed","% outcomes, non-exposed","95% CI, non-exposed",OR,p-value for OR=1,table
0,0,WCoutprem,respondents in Abidjan,47.058824,±23.73,23.484848,±5.11,2.896057,0.0411949,"[[8, 9], [62, 202]]"
1,0,WCshared,respondents in Abidjan,25.833333,±5.54,17.5,±11.78,1.642055,0.3236228,"[[62, 178], [7, 33]]"
2,0,SecEduHH,respondents in Abidjan,16.049383,±7.99,28.484848,±6.89,0.479975,0.0397225,"[[13, 68], [47, 118]]"
3,0,Sex_HH_F,respondents in Abidjan,33.333333,±10.27,21.5,±5.69,1.825581,0.04747818,"[[27, 54], [43, 157]]"
4,0,WCoutprem,respondents in Nairobi,71.384615,±4.91,3.6,±1.33,66.800478,2.2163119999999998e-124,"[[232, 93], [27, 723]]"
5,0,WCshared,respondents in Nairobi,25.121714,±2.65,0.0,±0.0,inf,0.02919612,"[[258, 769], [0, 15]]"
6,0,SecEduHH,respondents in Nairobi,17.269076,±3.32,30.932203,±4.17,0.466086,6.217937e-07,"[[86, 412], [146, 326]]"
7,0,Sex_HH_F,respondents in Nairobi,30.820399,±4.26,19.230769,±3.09,1.871154,1.393516e-05,"[[139, 312], [120, 504]]"


### 3.1.2. Confirm selection of independent variables: test for multicollinearity with Variance Inflation Factor
Note: as a thumbrule, variables with a VIF > 5 shall be discarded 

In [9]:
# Set list of selected exposure + control variables
exposure_lst = list_gen.copy()

# Dummy dataframes: keep only observations with valid answers for all selected variables
data_abidjan = df_abi[['WCunsafe']+exposure_lst].copy().dropna()
data_nairobi = df_nai[['WCunsafe']+exposure_lst].copy().dropna()
data_strings = ['Abidjan','Nairobi']

# Create table for Variance Inflation Factor (VIF)
vif_scores = pd.DataFrame() 
vif_scores["Attribute"] = exposure_lst

for idx,dataset in enumerate([data_abidjan,data_nairobi]):
    # Calculate VIF for each feature
    vif_var = "VIF Scores "+data_strings[idx]
    vif_scores[vif_var] = [variance_inflation_factor(dataset[exposure_lst].values, i) for i in range(len(dataset[exposure_lst].columns))] 
    # Set VIF threshold
    vif_t = 5
    if vif_scores[vif_var].max() < vif_t:
        print("All good: no significant multicollinearity",
              " in ",data_strings[idx])
    else:
        print(">> WARNING: multicollinearity detected for:",
              vif_scores["Attribute"][vif_scores[vif_var]>vif_t].to_list(),
              " in ",data_strings[idx])

# View results
display(vif_scores)

All good: no significant multicollinearity  in  Abidjan
All good: no significant multicollinearity  in  Nairobi


Unnamed: 0,Attribute,VIF Scores Abidjan,VIF Scores Nairobi
0,WCoutprem,1.078429,1.52399
1,WCshared,1.667076,3.59685
2,SecEduHH,1.323111,2.182574
3,Sex_HH_F,1.23658,1.698681


### 3.1.3. Adjusted odds ratios : multiple logistic regression with selected covariates

In [10]:
# Set list of selected exposure variables
outcome = 'WCunsafe'
exposure_lst = list_gen.copy()

# Data
data_reg_abi = df_abi[[outcome]+exposure_lst].copy().dropna()
data_reg_nai = df_nai[[outcome]+exposure_lst].copy().dropna()
data_strata = ['Respondents in Abidjan ( n = '+str(data_reg_abi.shape[0])+' )',
               'Respondents in Nairobi ( n = '+str(data_reg_nai.shape[0])+' )']

# Create table for ORs (initially empty)
or_strcty = pd.DataFrame()

# Discard exposure variable if all observations are exposed, or non-exposed (OR not plausible)
drop_list = []
print('==============================================================================')
print('                         CHECKING EXPOSURE VARIABLES:')
for data_reg in [data_reg_abi,data_reg_nai]:
    print('')
    for var in exposure_lst:
        if len(data_reg[var].value_counts().values)==1:
            drop_list = drop_list+[var]
            print('>> WARNING : dropped',var,'( only 0s or 1s in ',data_strata[idx],')')
if len(drop_list)==0:
    print('>> All good!')
elif len(drop_list)>0:
    exposure_lst = sorted(list(set(exposure_lst)-set(drop_list))) # update exposure variables list if needed
print('==============================================================================')

# Calculate stratified, adjusted ORs
for idx,data_reg in enumerate([data_reg_abi,data_reg_nai]):
    # Fit a logistic regression model with all the variables
    logit = sm.Logit(data_reg[outcome], sm.add_constant(data_reg[exposure_lst]))
    result = logit.fit()

    # Print the summary of the model
    print('==============================================================================')
    print('                       ',data_strata[idx])
    print(result.summary())

    # Calculate the odds ratios and confidence intervals
    params = result.params
    conf = result.conf_int()
    signif = result.pvalues
    modelpval = result.llr_pvalue
    odds_ratios = np.exp(params)
    conf_lower = np.exp(conf[0])
    conf_upper = np.exp(conf[1])

    # Print the odds ratios and confidence intervals
    or_strat = pd.DataFrame({'OR': odds_ratios,
                             'signif': signif,
                             'Lower CI': conf_lower,
                             'Upper CI': conf_upper}).reset_index()
    or_strat['Significance'] = ''
    if modelpval < 0.05:
        or_strat['Significance'][or_strat['signif']>=0.1] = 'Not significant'
        or_strat['Significance'][or_strat['signif']<0.1] = '*'
        or_strat['Significance'][or_strat['signif']<0.05] = '**'
        or_strat['Significance'][or_strat['signif']<0.01] = '***'
        or_strat['Significance'][or_strat['signif']<0.001] = '****'
    else:
        or_strat['Significance'] = 'Not significant (LLR p-value > 5%)'
    or_strat = or_strat.drop(['signif'],axis=1)
    multi_index = pd.MultiIndex.from_tuples([(data_strata[idx],'Exposure'),
                                             (data_strata[idx],'Adjusted OR'),
                                             (data_strata[idx],'Lower CI (95%)'),
                                             (data_strata[idx],'Upper CI (95%)'),
                                             (data_strata[idx],'Significance')])
    or_strat.columns = multi_index
    or_strat = or_strat.iloc[1:]
    or_strcty = pd.concat([or_strcty,or_strat],axis=1)

# See final OR table
print('==============================================================================')
print('                     Stratified, adjusted odds ratios :')
or_strcty = or_strcty.replace({'ImprvSan':'Access to improved sanitation facility',
                               'SecEduHH':'Head of household with secondary education',
                               'WCoutprem':'Toilet located out of premises',
                               'WCshared':'Toilet shared by more than one household',
                               'Sex_HH_F':'Female respondent'
                              })

# Export adjusted OR table
or_strcty.to_csv(path.join(root_dir,
                           "data/outputs/odds_ratios/df_OR_WCunsafe_adj.csv"))

# View result
or_strcty

                         CHECKING EXPOSURE VARIABLES:


>> All good!
Optimization terminated successfully.
         Current function value: 0.528443
         Iterations 6
                        Respondents in Abidjan ( n = 245 )
                           Logit Regression Results                           
Dep. Variable:               WCunsafe   No. Observations:                  245
Model:                          Logit   Df Residuals:                      240
Method:                           MLE   Df Model:                            4
Date:                Mon, 20 Feb 2023   Pseudo R-squ.:                 0.04271
Time:                        12:09:21   Log-Likelihood:                -129.47
converged:                       True   LL-Null:                       -135.24
Covariance Type:            nonrobust   LLR p-value:                   0.02101
                 coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------

Unnamed: 0_level_0,Respondents in Abidjan ( n = 245 ),Respondents in Abidjan ( n = 245 ),Respondents in Abidjan ( n = 245 ),Respondents in Abidjan ( n = 245 ),Respondents in Abidjan ( n = 245 ),Respondents in Nairobi ( n = 948 ),Respondents in Nairobi ( n = 948 ),Respondents in Nairobi ( n = 948 ),Respondents in Nairobi ( n = 948 ),Respondents in Nairobi ( n = 948 )
Unnamed: 0_level_1,Exposure,Adjusted OR,Lower CI (95%),Upper CI (95%),Significance,Exposure,Adjusted OR,Lower CI (95%),Upper CI (95%),Significance
1,Toilet located out of premises,3.135062,1.129151,8.704426,**,Toilet located out of premises,57.97187,35.931942,93.530652,****
2,Toilet shared by more than one household,1.580535,0.605188,4.127796,Not significant,Toilet shared by more than one household,50757740.0,0.0,inf,Not significant
3,Head of household with secondary education,0.540751,0.267719,1.092233,*,Head of household with secondary education,0.8101583,0.516902,1.26979,Not significant
4,Female respondent,1.615617,0.827024,3.156156,Not significant,Female respondent,1.25284,0.800516,1.960746,Not significant


## 3.2. Lack of hygiene in toilets

### 3.2.1. Bivariate odds ratio analysis to identify candidate explanatory variables

In [11]:
## Set list of exposure variables

# List
exposure_lst = ['WCoutprem', # most used toilet is located in a 'public' place (out of premises)
                'WCshared', # most used toilet is shared by more than onoe household
                'ImprvSan' # most used toilet is considered 'improved'
               ]

# Subsets list 1: general population
subsets_gen = [df_abi,df_nai]
subsets_gen_str = ['respondents in Abidjan','respondents in Nairobi']

In [12]:
## Perceived lack of hygiene in toilets

# Calculate odds ratios
outcome_var = "WCdirtyS"
outcome_pos = 1
outcome_neg = 0
df_oddsr_gen = pd.DataFrame()
for idx, subset in enumerate(subsets_gen):
    print("------------------ ",subsets_gen_str[idx]," ------------------")
    for exposure in exposure_lst:
        # define groups
        print("Variable:",exposure)
        exposure_grp = subset[subset[exposure]==1]
        no_exposure_grp = subset[subset[exposure]==0]
        exposure_grp = exposure_grp[~exposure_grp[outcome_var].isna()]
        no_exposure_grp = no_exposure_grp[~no_exposure_grp[outcome_var].isna()]
        # set table for Fisher tests
        table = np.array([[exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0], exposure_grp[exposure_grp[outcome_var]==outcome_neg].shape[0]],
                          [no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0], no_exposure_grp[no_exposure_grp[outcome_var]==outcome_neg].shape[0]]])
        # calculate proportion of households with at least 1 case
        exposure_grp_prop = exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0]/exposure_grp.shape[0]
        no_exposure_grp_prop = no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0]/no_exposure_grp.shape[0]
        # calculate 95% CI - exposure group
        P_exp = exposure_grp_prop
        N_exp = exposure_grp.shape[0]
        CI_exp = 1.96*(np.sqrt((P_exp*(1-P_exp))/N_exp))
        # calculate 95% CI - no exposure group
        P_ne = no_exposure_grp_prop
        N_ne = no_exposure_grp.shape[0]
        CI_ne = 1.96*(np.sqrt((P_ne*(1-P_ne))/N_ne))
        # run Fisher tests for OR = 1
        oddsratio_eq1, pvalue_eq1 = stats.fisher_exact(table)
        # run Fisher tests for OR > 1
        oddsratio_greater1, pvalue_greater1 = stats.fisher_exact(table,alternative="greater")
        # run Fisher tests for OR < 1
        oddsratio_less1, pvalue_less1 = stats.fisher_exact(table,alternative="less")
        # add results to dataframe
        df_oddsr_gen = df_oddsr_gen.append([[exposure,
                                             subsets_gen_str[idx],
                                             exposure_grp_prop*100,
                                             '±'+str(round(CI_exp*100,2)),
                                             no_exposure_grp_prop*100,
                                             '±'+str(round(CI_ne*100,2)),
                                             oddsratio_eq1,
                                             pvalue_eq1,
                                             pvalue_greater1,
                                             pvalue_less1,
                                             table
                                            ]])

# Reset columns' names & index
df_oddsr_gen.columns = ['exposure variable','stratum',
                        '% outcomes, exposed','95% CI, exposed',
                        '% outcomes, non-exposed','95% CI, non-exposed',
                        'OR','p-value for OR=1',
                        'p_OR_hi_1','p_OR_lo_1',
                        'table'
                       ]
df_oddsr_gen = df_oddsr_gen.reset_index()

------------------  respondents in Abidjan  ------------------
Variable: WCoutprem
Variable: WCshared
Variable: ImprvSan
------------------  respondents in Nairobi  ------------------
Variable: WCoutprem
Variable: WCshared
Variable: ImprvSan


In [13]:
## List of significant covariates

# Significance threshold (maximum p-value)
st = 0.05

# General population
list_gen = list(set(df_oddsr_gen['exposure variable'][df_oddsr_gen['p-value for OR=1']<st].to_list()))

# Dataframe containing only selected covariates, by age group
df_raw_or = df_oddsr_gen[df_oddsr_gen['exposure variable'].isin(list_gen)].drop(['p_OR_hi_1', 'p_OR_lo_1'],axis=1)

# Check results
print("Selected covariates (n=",len(list_gen),"):",list_gen)
df_raw_or

Selected covariates (n= 3 ): ['WCoutprem', 'WCshared', 'ImprvSan']


Unnamed: 0,index,exposure variable,stratum,"% outcomes, exposed","95% CI, exposed","% outcomes, non-exposed","95% CI, non-exposed",OR,p-value for OR=1,table
0,0,WCoutprem,respondents in Abidjan,47.058824,±23.73,32.69962,±5.67,1.829457,0.2888321,"[[8, 9], [86, 177]]"
1,0,WCshared,respondents in Abidjan,35.983264,±6.08,17.5,±11.78,2.64986,0.02854116,"[[86, 153], [7, 33]]"
2,0,ImprvSan,respondents in Abidjan,32.631579,±9.43,34.054054,±6.83,0.937996,0.8938375,"[[31, 64], [63, 122]]"
3,0,WCoutprem,respondents in Nairobi,4.012346,±2.14,20.133333,±2.87,0.165818,1.655086e-13,"[[13, 311], [151, 599]]"
4,0,WCshared,respondents in Nairobi,15.789474,±2.23,6.666667,±12.62,2.625,0.4889901,"[[162, 864], [1, 14]]"
5,0,ImprvSan,respondents in Nairobi,17.150761,±2.75,11.396011,±3.32,1.609516,0.01451804,"[[124, 599], [40, 311]]"


### 3.2.2. Adjusted odds ratios : multiple logistic regression with selected covariates

In [14]:
# Set list of selected exposure variables
outcome = 'WCdirtyS'
exposure_lst = list_gen.copy()

# Data
data_reg_abi = df_abi[[outcome]+exposure_lst].copy().dropna()
data_reg_nai = df_nai[[outcome]+exposure_lst].copy().dropna()
data_strata = ['Respondents in Abidjan ( n = '+str(data_reg_abi.shape[0])+' )',
               'Respondents in Nairobi ( n = '+str(data_reg_nai.shape[0])+' )']

# Create table for ORs (initially empty)
or_strcty = pd.DataFrame()

# Discard exposure variable if all observations are exposed, or non-exposed (OR not plausible)
drop_list = []
print('==============================================================================')
print('                         CHECKING EXPOSURE VARIABLES:')
for data_reg in [data_reg_abi,data_reg_nai]:
    print('')
    for var in exposure_lst:
        if len(data_reg[var].value_counts().values)==1:
            drop_list = drop_list+[var]
            print('>> WARNING : dropped',var,'( only 0s or 1s in ',data_strata[idx],' )')
if len(drop_list)==0:
    print('>> All good!')
elif len(drop_list)>0:
    exposure_lst = sorted(list(set(exposure_lst)-set(drop_list))) # update exposure variables list if needed
print('==============================================================================')

# Calculate stratified, adjusted ORs
for idx,data_reg in enumerate([data_reg_abi,data_reg_nai]):
    # Fit a logistic regression model with all the variables
    logit = sm.Logit(data_reg[outcome], sm.add_constant(data_reg[exposure_lst]))
    result = logit.fit()

    # Print the summary of the model
    print('==============================================================================')
    print('                       ',data_strata[idx])
    print(result.summary())

    # Calculate the odds ratios and confidence intervals
    params = result.params
    conf = result.conf_int()
    signif = result.pvalues
    modelpval = result.llr_pvalue
    odds_ratios = np.exp(params)
    conf_lower = np.exp(conf[0])
    conf_upper = np.exp(conf[1])

    # Print the odds ratios and confidence intervals
    or_strat = pd.DataFrame({'OR': odds_ratios,
                             'signif': signif,
                             'Lower CI': conf_lower,
                             'Upper CI': conf_upper}).reset_index()
    or_strat['Significance'] = ''
    if modelpval < 0.05:
        or_strat['Significance'][or_strat['signif']>=0.1] = 'Not significant'
        or_strat['Significance'][or_strat['signif']<0.1] = '*'
        or_strat['Significance'][or_strat['signif']<0.05] = '**'
        or_strat['Significance'][or_strat['signif']<0.01] = '***'
        or_strat['Significance'][or_strat['signif']<0.001] = '****'
    else:
        or_strat['Significance'] = 'Not significant (LLR p-value > 5%)'
    or_strat = or_strat.drop(['signif'],axis=1)
    multi_index = pd.MultiIndex.from_tuples([(data_strata[idx],'Exposure'),
                                             (data_strata[idx],'Adjusted OR'),
                                             (data_strata[idx],'Lower CI (95%)'),
                                             (data_strata[idx],'Upper CI (95%)'),
                                             (data_strata[idx],'Significance')])
    or_strat.columns = multi_index
    or_strat = or_strat.iloc[1:]
    or_strcty = pd.concat([or_strcty,or_strat],axis=1)

# See final OR table
print('==============================================================================')
print('                     Stratified, adjusted odds ratios :')
or_strcty = or_strcty.replace({'ImprvSan':'Access to improved sanitation facility',
                               'WCoutprem':'Toilet located out of premises',
                               'WCshared':'Toilet shared by more than one household',
                              })
# Export adjusted OR table
or_strcty.to_csv(path.join(root_dir,
                           "data/outputs/odds_ratios/df_OR_WCdirty_adj.csv"))

# View result
or_strcty

                         CHECKING EXPOSURE VARIABLES:


>> All good!
Optimization terminated successfully.
         Current function value: 0.623886
         Iterations 5
                        Respondents in Abidjan ( n = 279 )
                           Logit Regression Results                           
Dep. Variable:               WCdirtyS   No. Observations:                  279
Model:                          Logit   Df Residuals:                      275
Method:                           MLE   Df Model:                            3
Date:                Mon, 20 Feb 2023   Pseudo R-squ.:                 0.01984
Time:                        12:09:22   Log-Likelihood:                -174.06
converged:                       True   LL-Null:                       -177.59
Covariance Type:            nonrobust   LLR p-value:                   0.07043
                 coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------

Unnamed: 0_level_0,Respondents in Abidjan ( n = 279 ),Respondents in Abidjan ( n = 279 ),Respondents in Abidjan ( n = 279 ),Respondents in Abidjan ( n = 279 ),Respondents in Abidjan ( n = 279 ),Respondents in Nairobi ( n = 1041 ),Respondents in Nairobi ( n = 1041 ),Respondents in Nairobi ( n = 1041 ),Respondents in Nairobi ( n = 1041 ),Respondents in Nairobi ( n = 1041 )
Unnamed: 0_level_1,Exposure,Adjusted OR,Lower CI (95%),Upper CI (95%),Significance,Exposure,Adjusted OR,Lower CI (95%),Upper CI (95%),Significance
1,Toilet located out of premises,1.75438,0.647071,4.756584,Not significant (LLR p-value > 5%),Toilet located out of premises,0.152845,0.08377,0.278877,****
2,Toilet shared by more than one household,2.647672,1.110242,6.314085,Not significant (LLR p-value > 5%),Toilet shared by more than one household,3.776945,0.49255,28.962141,Not significant
3,Access to improved sanitation facility,1.095273,0.636575,1.884497,Not significant (LLR p-value > 5%),Access to improved sanitation facility,0.930523,0.616338,1.404868,Not significant


## 3.3. Use of a toilet out of premises by households with children

In [15]:
## Set list of exposure variables

# List
exposure_lst = ['Child_U5' # household with at least one child under five years
               ]

# Subsets list 1: general population
subsets_gen = [df_abi,df_nai]
subsets_gen_str = ['respondents in Abidjan','respondents in Nairobi']

In [16]:
# Calculate odds ratios
outcome_var = "WCoutprem"
outcome_pos = 1
outcome_neg = 0
df_oddsr_gen = pd.DataFrame()
for idx, subset in enumerate(subsets_gen):
    print("------------------ ",subsets_gen_str[idx]," ------------------")
    for exposure in exposure_lst:
        # define groups
        print("Variable:",exposure)
        exposure_grp = subset[subset[exposure]==1]
        no_exposure_grp = subset[subset[exposure]==0]
        exposure_grp = exposure_grp[~exposure_grp[outcome_var].isna()]
        no_exposure_grp = no_exposure_grp[~no_exposure_grp[outcome_var].isna()]
        # set table for Fisher tests
        table = np.array([[exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0], exposure_grp[exposure_grp[outcome_var]==outcome_neg].shape[0]],
                          [no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0], no_exposure_grp[no_exposure_grp[outcome_var]==outcome_neg].shape[0]]])
        # calculate proportion of households with at least 1 case
        exposure_grp_prop = exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0]/exposure_grp.shape[0]
        no_exposure_grp_prop = no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0]/no_exposure_grp.shape[0]
        # calculate 95% CI - exposure group
        P_exp = exposure_grp_prop
        N_exp = exposure_grp.shape[0]
        CI_exp = 1.96*(np.sqrt((P_exp*(1-P_exp))/N_exp))
        # calculate 95% CI - no exposure group
        P_ne = no_exposure_grp_prop
        N_ne = no_exposure_grp.shape[0]
        CI_ne = 1.96*(np.sqrt((P_ne*(1-P_ne))/N_ne))
        # run Fisher tests for OR = 1
        oddsratio_eq1, pvalue_eq1 = stats.fisher_exact(table)
        # run Fisher tests for OR > 1
        oddsratio_greater1, pvalue_greater1 = stats.fisher_exact(table,alternative="greater")
        # run Fisher tests for OR < 1
        oddsratio_less1, pvalue_less1 = stats.fisher_exact(table,alternative="less")
        # add results to dataframe
        df_oddsr_gen = df_oddsr_gen.append([[exposure,
                                             subsets_gen_str[idx],
                                             exposure_grp_prop*100,
                                             '±'+str(round(CI_exp*100,2)),
                                             no_exposure_grp_prop*100,
                                             '±'+str(round(CI_ne*100,2)),
                                             oddsratio_eq1,
                                             pvalue_eq1,
                                             pvalue_greater1,
                                             pvalue_less1,
                                             table
                                            ]])

# Reset columns' names & index
df_oddsr_gen.columns = ['exposure variable','stratum',
                        '% outcomes, exposed','95% CI, exposed',
                        '% outcomes, non-exposed','95% CI, non-exposed',
                        'OR','p-value for OR=1',
                        'p_OR_hi_1','p_OR_lo_1',
                        'table'
                       ]
df_oddsr_gen = df_oddsr_gen.reset_index()

df_oddsr_gen

------------------  respondents in Abidjan  ------------------
Variable: Child_U5
------------------  respondents in Nairobi  ------------------
Variable: Child_U5


Unnamed: 0,index,exposure variable,stratum,"% outcomes, exposed","95% CI, exposed","% outcomes, non-exposed","95% CI, non-exposed",OR,p-value for OR=1,p_OR_hi_1,p_OR_lo_1,table
0,0,Child_U5,respondents in Abidjan,3.404255,±2.32,2.710843,±1.75,1.264807,0.627634,0.405567,0.767833,"[[8, 227], [9, 323]]"
1,0,Child_U5,respondents in Nairobi,25.728155,±4.22,30.165289,±3.34,0.801952,0.116639,0.95216,0.063223,"[[106, 306], [219, 507]]"


# 4. Notes

### Location of toilets seems crucial to ensure safety
Toilet located out of premises was the only variable consistently and significantly associated with a perceived lack of safety across cities, and was much more significant than sharing.

### Hygiene status not consistent across sites
The logistic model with selected covariates was not significant in Abidjan, but was significant in Nairobi. There, toilets located out of premises were associated with better hygiene. This is certainly related to paid sanitation services (that may have more maintenance capacity) offered by different NGOs and/or community-based organisations, notably in the study site located in Mathare Valley.  
We note that, although public toilets tend to be cleaner in Nairobi, this advatage is partially compromised by the lack of safety to access these facilities.