In [1]:
import os
from pathlib import Path
from os import path

import numpy as np
import pandas as pd
import scipy.stats as stats

from functools import reduce

# 1. Input data

In [2]:
cwd = os.getcwd()
root_dir = Path(cwd).parent

In [3]:
# SET INPUTs & DIRECTORIES

# Individual data
data_IND_tot = pd.read_csv(path.join(root_dir,
                                    "data/individual_data.csv"))

# Household data
data_HH_tot = pd.read_csv(path.join(root_dir,
                                    "data/household_data.csv"))

# 2. Preprocess data

## 2.1. Create variable indicating whether households had at least 1 case of diarrhea
Obs.: distinguish cases in general population & in children under-5; recall period was the 2 weeks preceding the survey

In [4]:
# GET CASES OF DIARRHOEA

# Input
dst_data = data_HH_tot.copy()
src_data = data_IND_tot.copy()
# Source data: get only variables of interest
src_diarr_all = pd.DataFrame([src_data["PARENT_KEY"],src_data["Age"],src_data["Diarrhoea"]]).transpose()
src_diarr_all = src_diarr_all.rename(columns={"PARENT_KEY": "KEY"})# Better if joining column has same name ("KEY") in both DFs
# New column: indicate presence of at least one child under-5
dst_data["Child_U5"] = ""
lst_key_U5 = list(src_diarr_all["KEY"][src_diarr_all["Age"]<5])
# Filter source data: only individuals with diarrhoea
src_diarr_all = src_diarr_all[src_diarr_all["Diarrhoea"]=="Y"]
# New columns: indicate presence of at least one case of diarrhoea
# create columns
dst_data["D_Case_Tt"] = "" # any case
lst_key_total_cases = list(src_diarr_all["KEY"])
dst_data["D_Case_U5"] = "" # under-5
lst_key_U5_cases = list(src_diarr_all["KEY"][src_diarr_all["Age"]<5])
# set values
dst_data["D_Case_Tt"][dst_data["KEY"].isin(lst_key_total_cases)] = "Y"
dst_data["D_Case_Tt"][dst_data["D_Case_Tt"]!="Y"] = "N"
dst_data["D_Case_U5"][dst_data["KEY"].isin(lst_key_U5_cases)] = "Y"
dst_data["D_Case_U5"][dst_data["D_Case_U5"]!="Y"] = "N"
dst_data["Child_U5"][dst_data["KEY"].isin(lst_key_U5)] = "Y"
dst_data["Child_U5"][dst_data["Child_U5"]!="Y"] = "N"

dst_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dst_data["D_Case_Tt"][dst_data["KEY"].isin(lst_key_total_cases)] = "Y"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dst_data["D_Case_Tt"][dst_data["D_Case_Tt"]!="Y"] = "N"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dst_data["D_Case_U5"][dst_data["KEY"].isin(lst_key_U5_cases)] = "Y"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

Unnamed: 0.1,Unnamed: 0,uuid,KEY,HH_Number,Area_Code,ToiletFacility,ToiletFacilitySAFE,SITE_ID,CITY_ID,dist2wc,...,b_area,n_blg_cc,n_blg_ccM,n_CAR,n_CARM,n_mn_Bdev,n_mn_BdevM,Child_U5,D_Case_Tt,D_Case_U5
0,0,51e3d6c74bba45febd33457e11a9b459,uuid:464423ec-b7a0-4924-b3e4-ce85bb6a5286,252,Az2,Own_Dwelling,,Azito,Abidjan,0.0,...,160.283340,0.476315,0.521259,0.676246,0.517386,2.472029e+00,3.534918,N,N,N
1,1,51e3d6c74bba45febd33457e11a9b459,uuid:b3296fc2-e492-42ae-adb3-7bd1d50ce72e,796,Az1,Own_Dwelling,,Azito,Abidjan,0.0,...,190.270531,0.451753,0.521636,0.911541,0.482165,1.056956e+00,2.252288,N,Y,N
2,2,51e3d6c74bba45febd33457e11a9b459,uuid:ecad555f-212d-4188-9228-bb80e0fcba51,1,Az1,Own_Dwelling,,Azito,Abidjan,0.0,...,339.443519,0.314720,0.497692,0.447746,0.418284,2.823408e+00,3.560253,N,N,N
3,3,51e3d6c74bba45febd33457e11a9b459,uuid:9c38f788-bf34-4a13-a70e-782eea816703,628,Az3,Own_Dwelling,,Azito,Abidjan,0.0,...,173.679705,0.457577,0.504536,0.508321,0.376475,5.128454e-01,1.903155,N,N,N
4,4,51e3d6c74bba45febd33457e11a9b459,uuid:5af42a84-d16f-465c-bd56-8e8a07b2dda1,234,Az2,Own_YardPlot,Any_time,Azito,Abidjan,0.0,...,97.842773,0.489622,0.520548,0.289504,0.399063,1.830034e+00,2.794062,N,N,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1709,1712,4add689d7a314c899ba5137cc65e8265,uuid:a28286dd-149d-45a5-aff6-3cd82a810c76,643,MU6,Own_YardPlot,Any_time,Mukuru,Nairobi,0.0,...,66.586822,0.343347,0.420293,0.908437,0.749193,1.142982e-08,2.666655,Y,Y,Y
1710,1713,4add689d7a314c899ba5137cc65e8265,uuid:d1b2eb4b-5eeb-4981-98dd-ec3eb3dd896f,794,Mu4,Own_YardPlot,Any_time,Mukuru,Nairobi,0.0,...,41.158183,0.513595,0.373743,0.686765,0.761801,2.916848e-01,2.358749,Y,Y,N
1711,1714,4add689d7a314c899ba5137cc65e8265,uuid:5d29bc14-4e64-43f8-9dfa-945ddaae089a,328,MU1,Own_YardPlot,Any_time,Mukuru,Nairobi,0.0,...,96.962981,0.263408,0.364527,0.969127,0.699379,2.028727e-09,1.461015,Y,N,N
1712,1715,4add689d7a314c899ba5137cc65e8265,uuid:007e2f7b-0d59-4de9-aab1-3509c6c4566f,1253,Mu6,Own_YardPlot,Any_time,Mukuru,Nairobi,0.0,...,82.662454,0.578906,0.379600,0.747246,0.707976,1.842102e+00,3.087594,N,N,N


## 2.2. Set city-level subsets and key exposure variables

In [5]:
## Set Subsets

# Households with at least one child under-5
dst_data_U5 = dst_data[dst_data["Child_U5"]=="Y"]

# General population, Nairobi
mask = (dst_data["Area_Code"].str.startswith("M"))|(dst_data["Area_Code"].str.startswith("m"))
dst_data_nairobi = dst_data[mask]
print("N for Nairobi, general pop.:",dst_data_nairobi.shape[0])

# Under 5, Nairobi
mask = (dst_data_U5["Area_Code"].str.startswith("M"))|(dst_data_U5["Area_Code"].str.startswith("m"))
dst_data_U5_nairobi = dst_data_U5[mask]
print("N for Nairobi, under 5:",dst_data_U5_nairobi.shape[0])

# General population, Abidjan
mask = (~dst_data["Area_Code"].str.startswith("M"))&(~dst_data["Area_Code"].str.startswith("m"))
dst_data_abidjan = dst_data[mask]
print("N for Abidjan, general pop.:",dst_data_abidjan.shape[0])

# Under 5, Abidjan
mask = (~dst_data_U5["Area_Code"].str.startswith("M"))&(~dst_data_U5["Area_Code"].str.startswith("m"))
dst_data_U5_abidjan = dst_data_U5[mask]
print("N for Abidjan, under 5:",dst_data_U5_abidjan.shape[0])

if dst_data_nairobi.shape[0]+dst_data_abidjan.shape[0]==dst_data.shape[0]:
    print("subsets good to go!")
else:
    print("check subsets: issue with shape size(s)")

N for Nairobi, general pop.: 1147
N for Nairobi, under 5: 413
N for Abidjan, general pop.: 567
N for Abidjan, under 5: 235
subsets good to go!


In [6]:
## Set exposure variables necessary for analysis: WASH facilities' characteristics + potential confounders

# Recode perceived safety to go to WC
for df in [dst_data,dst_data_nairobi,dst_data_abidjan,
           dst_data_U5,dst_data_U5_nairobi,dst_data_U5_abidjan]:
    wc_unsf_mask = ((df["ToiletFacilitySAFE"]=="During_Day")|(df["ToiletFacilitySAFE"]=="Unsafe"))
    df["WCunsafe"] = np.nan
    df["WCunsafe"][~df["ToiletFacilitySAFE"].isna()] = 0
    df["WCunsafe"][wc_unsf_mask] = 1
    
# Recode location of toilet (within premises X out of premises, excluding open defecation from analysis)
for df in [dst_data,dst_data_nairobi,dst_data_abidjan,
           dst_data_U5,dst_data_U5_nairobi,dst_data_U5_abidjan]:
    # define "public"
    public_sel = ["Public","Neighb_YardPlot","Neighb_Dwelling"] # any WC located OUT OF PREMISES
    wc_loc_mask = (df["ToiletFacility"].isin(public_sel))
    df["WCpublic"] = np.nan
    df["WCpublic"][(~df["ToiletFacility"].isna())&(df["ToiletFacility"]!="NoFacility_Nature")] = 0
    df["WCpublic"][wc_loc_mask] = 1
            

## Potential confounder

# Education level of heads of households
src = data_IND_tot[data_IND_tot.Relation_to_HH=='Head'][['PARENT_KEY',
                                                         'School_past']]# subset ed. level of HH
src = src.rename(columns={"PARENT_KEY": "KEY"})# rename key variable allowing for data join
# Recode education level of heads of households
src['SecEduHH'] = np.nan
src['SecEduHH'][src['School_past'].isin(['No_Edu',
                                         'Early_CdE',
                                         'Primary',
                                         'Coranic'])] = 0 # up to primary education
src['SecEduHH'][src['School_past'].isin(['Secondary',
                                         'Secondary_1',
                                         'Secondary_2',
                                         'High_Ed'])] = 1 # secondary or higher education
src = src[~src.SecEduHH.isna()]
# In case a same household has 2 heads, merge lines
src = src[['SecEduHH','KEY']].groupby(by="KEY").max().reset_index()
# Attribute education level of heads of households
dst_data = dst_data.merge(src,on="KEY",how='left')
dst_data_nairobi = dst_data_nairobi.merge(src,on="KEY",how='left')
dst_data_abidjan = dst_data_abidjan.merge(src,on="KEY",how='left')
dst_data_U5 = dst_data_U5.merge(src,on="KEY",how='left')
dst_data_U5_nairobi = dst_data_U5_nairobi.merge(src,on="KEY",how='left')
dst_data_U5_abidjan = dst_data_U5_abidjan.merge(src,on="KEY",how='left')

# 3. Odds ratios (i): associations between risk of diarrhoea and perceived safety to use toilets

In [7]:
# Descriptive statistics: number of valid observations for odds ratio analysis (households using WC outside dwelling)
print('Households using a toilet somewhere outside the dwelling in Abidjan:')
print(dst_data_nairobi.WCunsafe.isna().value_counts()[0])
print('---------------------------------------------------------------------')
print('Households using a toilet somewhere outside the dwelling in Nairobi:')
print(dst_data_abidjan.WCunsafe.isna().value_counts()[0])

Households using a toilet somewhere outside the dwelling in Abidjan:
1075
---------------------------------------------------------------------
Households using a toilet somewhere outside the dwelling in Nairobi:
281


## 3.1. Calculate odds ratios and Fisher's exact tests, stratified by city and age group

In [8]:
## Set list of exposure variables

# List
exposure_lst = ["WCunsafe"]# perceived safety to go to WC (1 = unsafe)

# Subsets list 1: general population
subsets_gen = [dst_data,dst_data_nairobi,dst_data_abidjan]
subsets_gen_str = ['hhGen_total','hhGen_nairobi','hhGen_abidjan']

# Subsets list 2: households with children under five years old
subsets_cU5 = [dst_data_U5,dst_data_U5_nairobi,dst_data_U5_abidjan]
subsets_cU5_str = ['hhU5_total','hhU5_nairobi','hhU5_abidjan']

In [9]:
## Risk of diarrhoea, general population

# Calculate odds ratios
outcome_var = "D_Case_Tt"
outcome_pos = "Y"
outcome_neg = "N"
df_oddsr_gen = pd.DataFrame()
for idx, subset in enumerate(subsets_gen):
    print("------------------ ",subsets_gen_str[idx]," ------------------")
    for exposure in exposure_lst:
        # define groups
        print("Variable:",exposure)
        exposure_grp = subset[subset[exposure]==1]
        no_exposure_grp = subset[subset[exposure]==0]
        exposure_grp = exposure_grp[~exposure_grp[outcome_var].isna()]
        no_exposure_grp = no_exposure_grp[~no_exposure_grp[outcome_var].isna()]
        # set table for Fisher tests
        table = np.array([[exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0], exposure_grp[exposure_grp[outcome_var]==outcome_neg].shape[0]],
                          [no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0], no_exposure_grp[no_exposure_grp[outcome_var]==outcome_neg].shape[0]]])
        # calculate proportion of households with at least 1 case
        exposure_grp_prop = exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0]/exposure_grp.shape[0]
        no_exposure_grp_prop = no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0]/no_exposure_grp.shape[0]
        # calculate 95% CI - exposure group
        P_exp = exposure_grp_prop
        N_exp = exposure_grp.shape[0]
        CI_exp = 1.96*(np.sqrt((P_exp*(1-P_exp))/N_exp))
        # calculate 95% CI - no exposure group
        P_ne = no_exposure_grp_prop
        N_ne = no_exposure_grp.shape[0]
        CI_ne = 1.96*(np.sqrt((P_ne*(1-P_ne))/N_ne))
        # run Fisher tests for OR = 1
        oddsratio_eq1, pvalue_eq1 = stats.fisher_exact(table)
        # run Fisher tests for OR > 1
        oddsratio_greater1, pvalue_greater1 = stats.fisher_exact(table,alternative="greater")
        # run Fisher tests for OR < 1
        oddsratio_less1, pvalue_less1 = stats.fisher_exact(table,alternative="less")
        # add results to dataframe
        df_oddsr_gen = df_oddsr_gen.append([[exposure,
                                             subsets_gen_str[idx],
                                             exposure_grp_prop*100,
                                             '±'+str(round(CI_exp*100,2)),
                                             no_exposure_grp_prop*100,
                                             '±'+str(round(CI_ne*100,2)),
                                             oddsratio_eq1,
                                             pvalue_eq1,
                                             pvalue_greater1,
                                             pvalue_less1,
                                             table
                                            ]])

# Reset columns' names & index
df_oddsr_gen.columns = ['exp_variable','subset',
                        'prop_exp','CI_pp_exp','prop_no_exp','CI_pp_no_exp',
                        'OR','p_OR_eq_1','p_OR_hi_1','p_OR_lo_1',
                        'table'
                       ]
df_oddsr_gen = df_oddsr_gen.reset_index()

# Check results
df_oddsr_gen

------------------  hhGen_total  ------------------
Variable: WCunsafe
------------------  hhGen_nairobi  ------------------
Variable: WCunsafe
------------------  hhGen_abidjan  ------------------
Variable: WCunsafe


Unnamed: 0,index,exp_variable,subset,prop_exp,CI_pp_exp,prop_no_exp,CI_pp_no_exp,OR,p_OR_eq_1,p_OR_hi_1,p_OR_lo_1,table
0,0,WCunsafe,hhGen_total,39.817629,±5.29,25.70594,±2.67,1.912171,2e-06,1e-06,0.999999,"[[131, 198], [264, 763]]"
1,0,WCunsafe,hhGen_nairobi,36.293436,±5.86,23.161765,±2.89,1.889947,4.8e-05,2.9e-05,0.999985,"[[94, 165], [189, 627]]"
2,0,WCunsafe,hhGen_abidjan,52.857143,±11.69,35.545024,±6.46,2.033131,0.011569,0.008016,0.996394,"[[37, 33], [75, 136]]"


In [10]:
## Risk of diarrhoea, under fives subsets

# Calculate odds ratios
outcome_var = "D_Case_U5"
outcome_pos = "Y"
outcome_neg = "N"
df_oddsr_cU5 = pd.DataFrame()
for idx, subset in enumerate(subsets_cU5):
    print("------------------ ",subsets_cU5_str[idx]," ------------------")
    for exposure in exposure_lst:
        # define groups
        print("Variable:",exposure)
        exposure_grp = subset[subset[exposure]==1]
        no_exposure_grp = subset[subset[exposure]==0]
        exposure_grp = exposure_grp[~exposure_grp[outcome_var].isna()]
        no_exposure_grp = no_exposure_grp[~no_exposure_grp[outcome_var].isna()]
        # set table for Fisher tests
        table = np.array([[exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0], exposure_grp[exposure_grp[outcome_var]==outcome_neg].shape[0]],
                          [no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0], no_exposure_grp[no_exposure_grp[outcome_var]==outcome_neg].shape[0]]])
        # calculate proportion of households with at least 1 case
        exposure_grp_prop = exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0]/exposure_grp.shape[0]
        no_exposure_grp_prop = no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0]/no_exposure_grp.shape[0]
        # calculate 95% CI - exposure group
        P_exp = exposure_grp_prop
        N_exp = exposure_grp.shape[0]
        CI_exp = 1.96*(np.sqrt((P_exp*(1-P_exp))/N_exp))
        # calculate 95% CI - no exposure group
        P_ne = no_exposure_grp_prop
        N_ne = no_exposure_grp.shape[0]
        CI_ne = 1.96*(np.sqrt((P_ne*(1-P_ne))/N_ne))
        # run Fisher tests for OR = 1
        oddsratio_eq1, pvalue_eq1 = stats.fisher_exact(table)
        # run Fisher tests for OR > 1
        oddsratio_greater1, pvalue_greater1 = stats.fisher_exact(table,alternative="greater")
        # run Fisher tests for OR < 1
        oddsratio_less1, pvalue_less1 = stats.fisher_exact(table,alternative="less")
        # add results to dataframe
        df_oddsr_cU5 = df_oddsr_cU5.append([[exposure,
                                             subsets_cU5_str[idx],
                                             exposure_grp_prop*100,
                                             '±'+str(round(CI_exp*100,2)),
                                             no_exposure_grp_prop*100,
                                             '±'+str(round(CI_ne*100,2)),
                                             oddsratio_eq1,
                                             pvalue_eq1,
                                             pvalue_greater1,
                                             pvalue_less1,
                                             table
                                            ]])

# Reset columns' names
df_oddsr_cU5.columns = ['exp_variable','subset',
                        'prop_exp','CI_pp_exp','prop_no_exp','CI_pp_no_exp',
                        'OR','p_OR_eq_1','p_OR_hi_1','p_OR_lo_1',
                        'table'
                       ]
df_oddsr_cU5 = df_oddsr_cU5.reset_index()

# Check results
df_oddsr_cU5

------------------  hhU5_total  ------------------
Variable: WCunsafe
------------------  hhU5_nairobi  ------------------
Variable: WCunsafe
------------------  hhU5_abidjan  ------------------
Variable: WCunsafe


Unnamed: 0,index,exp_variable,subset,prop_exp,CI_pp_exp,prop_no_exp,CI_pp_no_exp,OR,p_OR_eq_1,p_OR_hi_1,p_OR_lo_1,table
0,0,WCunsafe,hhU5_total,33.333333,±8.77,26.020408,±4.34,1.421569,0.148911,0.081898,0.948097,"[[37, 74], [102, 290]]"
1,0,WCunsafe,hhU5_nairobi,32.5,±10.26,26.045016,±4.88,1.36717,0.262186,0.155223,0.901188,"[[26, 54], [81, 230]]"
2,0,WCunsafe,hhU5_abidjan,35.483871,±16.84,25.925926,±9.54,1.571429,0.35395,0.21954,0.890582,"[[11, 20], [21, 60]]"


## 3.2. Stratified analysis : group by education

In [11]:
## Stratify subsets - general population

# Group by confounding variable
c_var = 'SecEduHH'
hied_dst_data = dst_data[dst_data[c_var]==1] # high education level, aggregated dataset
loed_dst_data = dst_data[dst_data[c_var]==0] # low education level, aggregated dataset
hied_dst_data_nairobi = dst_data_nairobi[dst_data_nairobi[c_var]==1] # high education level, Nairobi subset
loed_dst_data_nairobi = dst_data_nairobi[dst_data_nairobi[c_var]==0] # low education level, Nairobi subset
hied_dst_data_abidjan = dst_data_abidjan[dst_data_abidjan[c_var]==1] # high education level, Abidjan subset
loed_dst_data_abidjan = dst_data_abidjan[dst_data_abidjan[c_var]==0] # low education level, Abidjan subset

# Update variables list
exposure_lst_str = list(set(exposure_lst)-set([c_var]))

# Update subsets list
subsets_gen = [hied_dst_data,loed_dst_data,
               hied_dst_data_nairobi,loed_dst_data_nairobi,
               hied_dst_data_abidjan,loed_dst_data_abidjan
              ]
subsets_gen_str = ['hied_hhGen_total','loed_hhGen_total',
                   'hied_hhGen_nairobi','loed_hhGen_nairobi',
                   'hied_hhGen_abidjan','loed_hhGen_abidjan'
                  ]

## Risk of diarrhoea, general population

# Calculate odds ratios
outcome_var = "D_Case_Tt"
outcome_pos = "Y"
outcome_neg = "N"
df_oddsr_gen_str = pd.DataFrame()
for idx, subset in enumerate(subsets_gen):
    print("------------------ ",subsets_gen_str[idx]," ------------------")
    for exposure in exposure_lst:
        # define groups
        print("Variable:",exposure)
        exposure_grp = subset[subset[exposure]==1]
        no_exposure_grp = subset[subset[exposure]==0]
        exposure_grp = exposure_grp[~exposure_grp[outcome_var].isna()]
        no_exposure_grp = no_exposure_grp[~no_exposure_grp[outcome_var].isna()]
        # set table for Fisher tests
        table = np.array([[exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0], exposure_grp[exposure_grp[outcome_var]==outcome_neg].shape[0]],
                          [no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0], no_exposure_grp[no_exposure_grp[outcome_var]==outcome_neg].shape[0]]])
        # calculate proportion of households with at least 1 case
        exposure_grp_prop = exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0]/exposure_grp.shape[0]
        no_exposure_grp_prop = no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0]/no_exposure_grp.shape[0]
        # calculate 95% CI - exposure group
        P_exp = exposure_grp_prop
        N_exp = exposure_grp.shape[0]
        CI_exp = 1.96*(np.sqrt((P_exp*(1-P_exp))/N_exp))
        # calculate 95% CI - no exposure group
        P_ne = no_exposure_grp_prop
        N_ne = no_exposure_grp.shape[0]
        CI_ne = 1.96*(np.sqrt((P_ne*(1-P_ne))/N_ne))
        # run Fisher tests for OR = 1
        oddsratio_eq1, pvalue_eq1 = stats.fisher_exact(table)
        # run Fisher tests for OR > 1
        oddsratio_greater1, pvalue_greater1 = stats.fisher_exact(table,alternative="greater")
        # run Fisher tests for OR < 1
        oddsratio_less1, pvalue_less1 = stats.fisher_exact(table,alternative="less")
        # add results to dataframe
        df_oddsr_gen_str = df_oddsr_gen_str.append([[exposure,
                                                     subsets_gen_str[idx],
                                                     exposure_grp_prop*100,
                                                     '±'+str(round(CI_exp*100,2)),
                                                     no_exposure_grp_prop*100,
                                                     '±'+str(round(CI_ne*100,2)),
                                                     oddsratio_eq1,
                                                     pvalue_eq1,
                                                     pvalue_greater1,
                                                     pvalue_less1,
                                                     table
                                                    ]])

# Reset columns' names & index
df_oddsr_gen_str.columns = ['exp_variable','subset',
                            'prop_exp','CI_pp_exp','prop_no_exp','CI_pp_no_exp',
                            'OR','p_OR_eq_1','p_OR_hi_1','p_OR_lo_1',
                            'table'
                           ]
df_oddsr_gen_str = df_oddsr_gen_str.reset_index()

# Check results
df_oddsr_gen_str

------------------  hied_hhGen_total  ------------------
Variable: WCunsafe
------------------  loed_hhGen_total  ------------------
Variable: WCunsafe
------------------  hied_hhGen_nairobi  ------------------
Variable: WCunsafe
------------------  loed_hhGen_nairobi  ------------------
Variable: WCunsafe
------------------  hied_hhGen_abidjan  ------------------
Variable: WCunsafe
------------------  loed_hhGen_abidjan  ------------------
Variable: WCunsafe


Unnamed: 0,index,exp_variable,subset,prop_exp,CI_pp_exp,prop_no_exp,CI_pp_no_exp,OR,p_OR_eq_1,p_OR_hi_1,p_OR_lo_1,table
0,0,WCunsafe,hied_hhGen_total,33.333333,±9.29,22.916667,±3.76,1.681818,0.039877,0.021585,0.988266,"[[33, 66], [110, 370]]"
1,0,WCunsafe,loed_hhGen_total,43.005181,±6.98,27.927928,±4.17,1.947214,0.000231,0.000157,0.999923,"[[83, 110], [124, 320]]"
2,0,WCunsafe,hied_hhGen_nairobi,33.72093,±9.99,18.932039,±3.78,2.178587,0.0037,0.002623,0.998889,"[[29, 57], [78, 334]]"
3,0,WCunsafe,loed_hhGen_nairobi,38.356164,±7.89,27.300613,±4.84,1.656929,0.017794,0.011293,0.993653,"[[56, 90], [89, 237]]"
4,0,WCunsafe,hied_hhGen_abidjan,30.769231,±25.09,47.058824,±11.86,0.5,0.366928,0.919333,0.219724,"[[4, 9], [32, 36]]"
5,0,WCunsafe,loed_hhGen_abidjan,57.446809,±14.14,29.661017,±8.24,3.201429,0.001272,0.000901,0.999744,"[[27, 20], [35, 83]]"


In [12]:
## Stratify subsets - under fives

# Group by confounding variable
c_var = 'SecEduHH'
hied_dst_data = dst_data_U5[dst_data_U5[c_var]==1] # high education level, aggregated dataset
loed_dst_data = dst_data_U5[dst_data_U5[c_var]==0] # low education level, aggregated dataset
hied_dst_data_nairobi = dst_data_U5_nairobi[dst_data_U5_nairobi[c_var]==1] # high education level, Nairobi subset
loed_dst_data_nairobi = dst_data_U5_nairobi[dst_data_U5_nairobi[c_var]==0] # low education level, Nairobi subset
hied_dst_data_abidjan = dst_data_U5_abidjan[dst_data_U5_abidjan[c_var]==1] # high education level, Abidjan subset
loed_dst_data_abidjan = dst_data_U5_abidjan[dst_data_U5_abidjan[c_var]==0] # low education level, Abidjan subset

# Update variables list
exposure_lst_str = list(set(exposure_lst)-set([c_var]))

# Update subsets list
subsets_cU5 = [hied_dst_data,loed_dst_data,
               hied_dst_data_nairobi,loed_dst_data_nairobi,
               hied_dst_data_abidjan,loed_dst_data_abidjan
              ]
subsets_cU5_str = ['hied_hhU5_total','loed_hhU5_total',
                   'hied_hhU5_nairobi','loed_hhU5_nairobi',
                   'hied_hhU5_abidjan','loed_hhU5_abidjan'
                  ]

## Risk of diarrhoea, under fives subset

# Calculate odds ratios
outcome_var = "D_Case_U5"
outcome_pos = "Y"
outcome_neg = "N"
df_oddsr_cU5_str = pd.DataFrame()
for idx, subset in enumerate(subsets_cU5):
    print("------------------ ",subsets_cU5_str[idx]," ------------------")
    for exposure in exposure_lst:
        # define groups
        print("Variable:",exposure)
        exposure_grp = subset[subset[exposure]==1]
        no_exposure_grp = subset[subset[exposure]==0]
        exposure_grp = exposure_grp[~exposure_grp[outcome_var].isna()]
        no_exposure_grp = no_exposure_grp[~no_exposure_grp[outcome_var].isna()]
        # set table for Fisher tests
        table = np.array([[exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0], exposure_grp[exposure_grp[outcome_var]==outcome_neg].shape[0]],
                          [no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0], no_exposure_grp[no_exposure_grp[outcome_var]==outcome_neg].shape[0]]])
        # calculate proportion of households with at least 1 case
        exposure_grp_prop = exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0]/exposure_grp.shape[0]
        no_exposure_grp_prop = no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0]/no_exposure_grp.shape[0]
        # calculate 95% CI - exposure group
        P_exp = exposure_grp_prop
        N_exp = exposure_grp.shape[0]
        CI_exp = 1.96*(np.sqrt((P_exp*(1-P_exp))/N_exp))
        # calculate 95% CI - no exposure group
        P_ne = no_exposure_grp_prop
        N_ne = no_exposure_grp.shape[0]
        CI_ne = 1.96*(np.sqrt((P_ne*(1-P_ne))/N_ne))
        # run Fisher tests for OR = 1
        oddsratio_eq1, pvalue_eq1 = stats.fisher_exact(table)
        # run Fisher tests for OR > 1
        oddsratio_greater1, pvalue_greater1 = stats.fisher_exact(table,alternative="greater")
        # run Fisher tests for OR < 1
        oddsratio_less1, pvalue_less1 = stats.fisher_exact(table,alternative="less")
        # add results to dataframe
        df_oddsr_cU5_str = df_oddsr_cU5_str.append([[exposure,
                                                     subsets_cU5_str[idx],
                                                     exposure_grp_prop*100,
                                                     '±'+str(round(CI_exp*100,2)),
                                                     no_exposure_grp_prop*100,
                                                     '±'+str(round(CI_ne*100,2)),
                                                     oddsratio_eq1,
                                                     pvalue_eq1,
                                                     pvalue_greater1,
                                                     pvalue_less1,
                                                     table
                                                    ]])

# Reset columns' names & index
df_oddsr_cU5_str.columns = ['exp_variable','subset',
                            'prop_exp','CI_pp_exp','prop_no_exp','CI_pp_no_exp',
                            'OR','p_OR_eq_1','p_OR_hi_1','p_OR_lo_1',
                            'table'
                           ]
df_oddsr_cU5_str = df_oddsr_cU5_str.reset_index()

# Check results
df_oddsr_cU5_str

------------------  hied_hhU5_total  ------------------
Variable: WCunsafe
------------------  loed_hhU5_total  ------------------
Variable: WCunsafe
------------------  hied_hhU5_nairobi  ------------------
Variable: WCunsafe
------------------  loed_hhU5_nairobi  ------------------
Variable: WCunsafe
------------------  hied_hhU5_abidjan  ------------------
Variable: WCunsafe
------------------  loed_hhU5_abidjan  ------------------
Variable: WCunsafe


Unnamed: 0,index,exp_variable,subset,prop_exp,CI_pp_exp,prop_no_exp,CI_pp_no_exp,OR,p_OR_eq_1,p_OR_hi_1,p_OR_lo_1,table
0,0,WCunsafe,hied_hhU5_total,20.0,±13.25,25.806452,±6.29,0.71875,0.529698,0.826158,0.309829,"[[7, 28], [48, 138]]"
1,0,WCunsafe,loed_hhU5_total,42.622951,±12.41,29.139073,±7.25,1.806494,0.075663,0.043014,0.978968,"[[26, 35], [44, 107]]"
2,0,WCunsafe,hied_hhU5_nairobi,18.75,±13.52,25.0,±6.8,0.692308,0.505252,0.836816,0.30618,"[[6, 26], [39, 117]]"
3,0,WCunsafe,loed_hhU5_nairobi,44.736842,±15.81,30.973451,±8.53,1.804082,0.166566,0.089967,0.957916,"[[17, 21], [35, 78]]"
4,0,WCunsafe,hied_hhU5_abidjan,33.333333,±53.34,30.0,±16.4,1.166667,1.0,0.675403,0.788306,"[[1, 2], [9, 21]]"
5,0,WCunsafe,loed_hhU5_abidjan,39.130435,±19.95,23.684211,±13.52,2.071429,0.251431,0.160484,0.94103,"[[9, 14], [9, 29]]"


# 4. Odds ratios (ii): associations between toilet location and perceived safety to use toilets

In [13]:
## Set list of exposure variables

# List
exposure_lst = ["WCpublic"]


## Subsets list
subsets_gen = [dst_data,dst_data_nairobi,dst_data_abidjan]
subsets_gen_str = ['hhGen_total','hhGen_nairobi','hhGen_abidjan']


## Risk of feeling unsafe to access WC

# Calculate odds ratios
outcome_var = "WCunsafe"
outcome_pos = 1
outcome_neg = 0
df_oddsr_WCunsafe = pd.DataFrame()
for idx, subset in enumerate(subsets_gen):
    print("------------------ ",subsets_gen_str[idx]," ------------------")
    for exposure in exposure_lst:
        # define groups
        print("Variable:",exposure)
        exposure_grp = subset[subset[exposure]==1]
        no_exposure_grp = subset[subset[exposure]==0]
        exposure_grp = exposure_grp[~exposure_grp[outcome_var].isna()]
        no_exposure_grp = no_exposure_grp[~no_exposure_grp[outcome_var].isna()]
        # set table for Fisher tests
        table = np.array([[exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0], exposure_grp[exposure_grp[outcome_var]==outcome_neg].shape[0]],
                          [no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0], no_exposure_grp[no_exposure_grp[outcome_var]==outcome_neg].shape[0]]])
        # calculate proportion of households with at least 1 case
        exposure_grp_prop = exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0]/exposure_grp.shape[0]
        no_exposure_grp_prop = no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0]/no_exposure_grp.shape[0]
        # calculate 95% CI - exposure group
        P_exp = exposure_grp_prop
        N_exp = exposure_grp.shape[0]
        CI_exp = 1.96*(np.sqrt((P_exp*(1-P_exp))/N_exp))
        # calculate 95% CI - no exposure group
        P_ne = no_exposure_grp_prop
        N_ne = no_exposure_grp.shape[0]
        CI_ne = 1.96*(np.sqrt((P_ne*(1-P_ne))/N_ne))
        # run Fisher tests for OR = 1
        oddsratio_eq1, pvalue_eq1 = stats.fisher_exact(table)
        # run Fisher tests for OR > 1
        oddsratio_greater1, pvalue_greater1 = stats.fisher_exact(table,alternative="greater")
        # run Fisher tests for OR < 1
        oddsratio_less1, pvalue_less1 = stats.fisher_exact(table,alternative="less")
        # add results to dataframe
        df_oddsr_WCunsafe = df_oddsr_WCunsafe.append([[exposure,
                                             subsets_gen_str[idx],
                                             exposure_grp_prop*100,
                                             '±'+str(round(CI_exp*100,2)),
                                             no_exposure_grp_prop*100,
                                             '±'+str(round(CI_ne*100,2)),
                                             oddsratio_eq1,
                                             pvalue_eq1,
                                             pvalue_greater1,
                                             pvalue_less1,
                                             table
                                            ]])

# Reset columns' names & index
df_oddsr_WCunsafe.columns = ['exp_variable','subset',
                             'prop_exp','CI_pp_exp','prop_no_exp','CI_pp_no_exp',
                             'OR','p_OR_eq_1','p_OR_hi_1','p_OR_lo_1',
                             'table'
                            ]
df_oddsr_WCunsafe = df_oddsr_WCunsafe.reset_index()

# Check results
df_oddsr_WCunsafe

------------------  hhGen_total  ------------------
Variable: WCpublic
------------------  hhGen_nairobi  ------------------
Variable: WCpublic
------------------  hhGen_abidjan  ------------------
Variable: WCpublic


Unnamed: 0,index,exp_variable,subset,prop_exp,CI_pp_exp,prop_no_exp,CI_pp_no_exp,OR,p_OR_eq_1,p_OR_hi_1,p_OR_lo_1,table
0,0,WCpublic,hhGen_total,70.175439,±4.85,8.77712,±1.74,24.454726,1.2167200000000001e-106,1.2167200000000001e-106,1.0,"[[240, 102], [89, 925]]"
1,0,WCpublic,hhGen_nairobi,71.384615,±4.91,3.6,±1.33,66.800478,2.2163119999999998e-124,2.2163119999999998e-124,1.0,"[[232, 93], [27, 723]]"
2,0,WCpublic,hhGen_abidjan,47.058824,±23.73,23.484848,±5.11,2.896057,0.0411949,0.03471295,0.990184,"[[8, 9], [62, 202]]"


## 4.2. Stratified analysis to account for confounders (education & gender)

### 4.2.1 Stratify by education level

In [14]:
## Stratify subsets

# Group by confounding variable
c_var = 'SecEduHH'
hied_dst_data = dst_data[dst_data[c_var]==1] # high education level, aggregated dataset
loed_dst_data = dst_data[dst_data[c_var]==0] # low education level, aggregated dataset
hied_dst_data_nairobi = dst_data_nairobi[dst_data_nairobi[c_var]==1] # high education level, Nairobi subset
loed_dst_data_nairobi = dst_data_nairobi[dst_data_nairobi[c_var]==0] # low education level, Nairobi subset
hied_dst_data_abidjan = dst_data_abidjan[dst_data_abidjan[c_var]==1] # high education level, Abidjan subset
loed_dst_data_abidjan = dst_data_abidjan[dst_data_abidjan[c_var]==0] # low education level, Abidjan subset

# Subsets list
subsets_gen = [hied_dst_data,loed_dst_data,
               hied_dst_data_nairobi,loed_dst_data_nairobi,
               hied_dst_data_abidjan,loed_dst_data_abidjan
              ]
subsets_gen_str = ['hied_hhGen_total','loed_hhGen_total',
                   'hied_hhGen_nairobi','loed_hhGen_nairobi',
                   'hied_hhGen_abidjan','loed_hhGen_abidjan'
                  ]


## Risk of feeling unsafe to access WC

# Exposure variable
exposure_lst = ["WCpublic"] # whether toilet is OUT of premises

# Calculate odds ratios
outcome_var = "WCunsafe"
outcome_pos = 1
outcome_neg = 0
df_oddsr_WCunsafe = pd.DataFrame()
for idx, subset in enumerate(subsets_gen):
    print("------------------ ",subsets_gen_str[idx]," ------------------")
    for exposure in exposure_lst:
        # define groups
        print("Variable:",exposure)
        exposure_grp = subset[subset[exposure]==1]
        no_exposure_grp = subset[subset[exposure]==0]
        exposure_grp = exposure_grp[~exposure_grp[outcome_var].isna()]
        no_exposure_grp = no_exposure_grp[~no_exposure_grp[outcome_var].isna()]
        # set table for Fisher tests
        table = np.array([[exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0], exposure_grp[exposure_grp[outcome_var]==outcome_neg].shape[0]],
                          [no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0], no_exposure_grp[no_exposure_grp[outcome_var]==outcome_neg].shape[0]]])
        # calculate proportion of households with at least 1 case
        exposure_grp_prop = exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0]/exposure_grp.shape[0]
        no_exposure_grp_prop = no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0]/no_exposure_grp.shape[0]
        # calculate 95% CI - exposure group
        P_exp = exposure_grp_prop
        N_exp = exposure_grp.shape[0]
        CI_exp = 1.96*(np.sqrt((P_exp*(1-P_exp))/N_exp))
        # calculate 95% CI - no exposure group
        P_ne = no_exposure_grp_prop
        N_ne = no_exposure_grp.shape[0]
        CI_ne = 1.96*(np.sqrt((P_ne*(1-P_ne))/N_ne))
        # run Fisher tests for OR = 1
        oddsratio_eq1, pvalue_eq1 = stats.fisher_exact(table)
        # run Fisher tests for OR > 1
        oddsratio_greater1, pvalue_greater1 = stats.fisher_exact(table,alternative="greater")
        # run Fisher tests for OR < 1
        oddsratio_less1, pvalue_less1 = stats.fisher_exact(table,alternative="less")
        # add results to dataframe
        df_oddsr_WCunsafe = df_oddsr_WCunsafe.append([[exposure,
                                             subsets_gen_str[idx],
                                             exposure_grp_prop*100,
                                             '±'+str(round(CI_exp*100,2)),
                                             no_exposure_grp_prop*100,
                                             '±'+str(round(CI_ne*100,2)),
                                             oddsratio_eq1,
                                             pvalue_eq1,
                                             pvalue_greater1,
                                             pvalue_less1,
                                             table
                                            ]])

# Reset columns' names & index
df_oddsr_WCunsafe.columns = ['exp_variable','subset',
                             'prop_exp','CI_pp_exp','prop_no_exp','CI_pp_no_exp',
                             'OR','p_OR_eq_1','p_OR_hi_1','p_OR_lo_1',
                             'table'
                            ]
df_oddsr_WCunsafe = df_oddsr_WCunsafe.reset_index()

# Check results
df_oddsr_WCunsafe

------------------  hied_hhGen_total  ------------------
Variable: WCpublic
------------------  loed_hhGen_total  ------------------
Variable: WCpublic
------------------  hied_hhGen_nairobi  ------------------
Variable: WCpublic
------------------  loed_hhGen_nairobi  ------------------
Variable: WCpublic
------------------  hied_hhGen_abidjan  ------------------
Variable: WCpublic
------------------  loed_hhGen_abidjan  ------------------
Variable: WCpublic


Unnamed: 0,index,exp_variable,subset,prop_exp,CI_pp_exp,prop_no_exp,CI_pp_no_exp,OR,p_OR_eq_1,p_OR_hi_1,p_OR_lo_1,table
0,0,WCpublic,hied_hhGen_total,68.141593,±8.59,4.72103,±1.93,43.166667,2.457526e-47,2.457526e-47,1.0,"[[77, 36], [22, 444]]"
1,0,WCpublic,loed_hhGen_total,70.769231,±6.38,12.443439,±3.08,17.035407,4.373129e-48,4.373129e-48,1.0,"[[138, 57], [55, 387]]"
2,0,WCpublic,hied_hhGen_nairobi,69.444444,±8.69,2.820513,±1.64,78.305785,2.7730169999999996e-50,2.7730169999999996e-50,1.0,"[[75, 33], [11, 379]]"
3,0,WCpublic,loed_hhGen_nairobi,72.131148,±6.5,4.844291,±2.48,50.840336,6.772532999999999e-57,6.772532999999999e-57,1.0,"[[132, 51], [14, 275]]"
4,0,WCpublic,hied_hhGen_abidjan,40.0,±42.94,14.473684,±7.91,3.939394,0.1799444,0.1799444,0.972624,"[[2, 3], [11, 65]]"
5,0,WCpublic,loed_hhGen_abidjan,50.0,±28.29,26.797386,±7.02,2.731707,0.1022931,0.087176,0.975516,"[[6, 6], [41, 112]]"


In [15]:
# Impact of education on location of WC

# ----------------------------------------------------------------------------------------------------
#                                          BOTH SITES
# ----------------------------------------------------------------------------------------------------

# Socioeconomic conditions - general population
# Join attributes (edu. level of HH to household dataset)
df_c = dst_data.copy()
# Recode WC location ("in"/"out")
public_sel = ["Public","Neighb_YardPlot","Neighb_Dwelling"]
df_c["WC_Loc_InOutPrem"] = np.nan
df_c["WC_Loc_InOutPrem"][df_c["ToiletFacility"].isin(public_sel)] = "out of premises"
df_c["WC_Loc_InOutPrem"][~df_c["ToiletFacility"].isin(public_sel+["NoFacility_Nature"])] = "within premises"

# general stats
print("Education level (head of household): \n",
      df_c["SecEduHH"].value_counts())
# define groups
lowmid_inc_t = df_c[df_c["SecEduHH"]==0]
midhigh_inc_t = df_c[df_c["SecEduHH"]==1]
lowmid_inc_t = lowmid_inc_t[~lowmid_inc_t["WC_Loc_InOutPrem"].isna()]
midhigh_inc_t = midhigh_inc_t[~midhigh_inc_t["WC_Loc_InOutPrem"].isna()]
# calculate % households using WC out of premises
lowmid_inc_t_prop = lowmid_inc_t[(lowmid_inc_t["WC_Loc_InOutPrem"]=="out of premises")].shape[0]/lowmid_inc_t.shape[0]
midhigh_inc_t_prop = midhigh_inc_t[(midhigh_inc_t["WC_Loc_InOutPrem"]=="out of premises")].shape[0]/midhigh_inc_t.shape[0]
# print results
P = lowmid_inc_t_prop
N = lowmid_inc_t.shape[0]
CI = 1.96*(np.sqrt((P*(1-P))/N))
print("% houses using WC OUT of premises, head of household w/ at most primary education:  \n",
      P*100)
print("N =",N)
print("CI = ±",CI*100,"%")
P = midhigh_inc_t_prop
N = midhigh_inc_t.shape[0]
CI = 1.96*(np.sqrt((P*(1-P))/N))
print("% houses using WC OUT of premises, head of household w/ at least secondary education:  \n",
      P*100)
print("N =",N)
print("CI = ±",CI*100,"%")

# -------------------- ODDS RATIO --------------------
print("------------------ ODDS RATIO ------------------")
table = np.array([[lowmid_inc_t[(lowmid_inc_t["WC_Loc_InOutPrem"]=="out of premises")].shape[0], lowmid_inc_t[lowmid_inc_t["WC_Loc_InOutPrem"]=="within premises"].shape[0]],
                  [midhigh_inc_t[(midhigh_inc_t["WC_Loc_InOutPrem"]=="out of premises")].shape[0], midhigh_inc_t[midhigh_inc_t["WC_Loc_InOutPrem"]=="within premises"].shape[0]]])
print(table)
oddsratio, pvalue = stats.fisher_exact(table)
print("OddsR: ", oddsratio, "p-Value (OR = 1):", pvalue)
oddsratio, pvalue = stats.fisher_exact(table,alternative="greater")
print("p-Value (OR > 1):", pvalue)
oddsratio, pvalue = stats.fisher_exact(table,alternative="less")
print("p-Value (OR < 1):", pvalue)

# ----------------------------------------------------------------------------------------------------
#                                          NAIROBI
# ----------------------------------------------------------------------------------------------------

print(" \n ------------------ NAIROBI ------------------")
# Socioeconomic conditions - General population
# Join attributes (edu. level of HH to household dataset)
df_c = dst_data_nairobi.copy()
# Recode WC location ("in"/"out")
public_sel = ["Public","Neighb_YardPlot","Neighb_Dwelling"]
df_c["WC_Loc_InOutPrem"] = np.nan
df_c["WC_Loc_InOutPrem"][df_c["ToiletFacility"].isin(public_sel)] = "out of premises"
df_c["WC_Loc_InOutPrem"][~df_c["ToiletFacility"].isin(public_sel+["NoFacility_Nature"])] = "within premises"

# general stats
print("Education level (head of household): \n",
      df_c["SecEduHH"].value_counts())
# define groups
lowmid_inc_t = df_c[df_c["SecEduHH"]==0]
midhigh_inc_t = df_c[df_c["SecEduHH"]==1]
lowmid_inc_t = lowmid_inc_t[~lowmid_inc_t["WC_Loc_InOutPrem"].isna()]
midhigh_inc_t = midhigh_inc_t[~midhigh_inc_t["WC_Loc_InOutPrem"].isna()]
# calculate % households using WC out of premises
lowmid_inc_t_prop = lowmid_inc_t[(lowmid_inc_t["WC_Loc_InOutPrem"]=="out of premises")].shape[0]/lowmid_inc_t.shape[0]
midhigh_inc_t_prop = midhigh_inc_t[(midhigh_inc_t["WC_Loc_InOutPrem"]=="out of premises")].shape[0]/midhigh_inc_t.shape[0]
# print results
P = lowmid_inc_t_prop
N = lowmid_inc_t.shape[0]
CI = 1.96*(np.sqrt((P*(1-P))/N))
print("% houses using WC OUT of premises, head of household w/ at most primary education:  \n",
      P*100)
print("N =",N)
print("CI = ±",CI*100,"%")
P = midhigh_inc_t_prop
N = midhigh_inc_t.shape[0]
CI = 1.96*(np.sqrt((P*(1-P))/N))
print("% houses using WC OUT of premises, head of household w/ at least secondary education:  \n",
      P*100)
print("N =",N)
print("CI = ±",CI*100,"%")

# -------------------- ODDS RATIO --------------------
print("------------------ ODDS RATIO ------------------")
table = np.array([[lowmid_inc_t[(lowmid_inc_t["WC_Loc_InOutPrem"]=="out of premises")].shape[0], lowmid_inc_t[lowmid_inc_t["WC_Loc_InOutPrem"]=="within premises"].shape[0]],
                  [midhigh_inc_t[(midhigh_inc_t["WC_Loc_InOutPrem"]=="out of premises")].shape[0], midhigh_inc_t[midhigh_inc_t["WC_Loc_InOutPrem"]=="within premises"].shape[0]]])
print(table)
oddsratio, pvalue = stats.fisher_exact(table)
print("OddsR: ", oddsratio, "p-Value (OR = 1):", pvalue)
oddsratio, pvalue = stats.fisher_exact(table,alternative="greater")
print("p-Value (OR > 1):", pvalue)
oddsratio, pvalue = stats.fisher_exact(table,alternative="less")
print("p-Value (OR < 1):", pvalue)

# ----------------------------------------------------------------------------------------------------
#                                          ABIDJAN
# ----------------------------------------------------------------------------------------------------

print(" \n ------------------ ABIDJAN ------------------")
# Socioeconomic conditions - General population
# Join attributes (edu. level of HH to household dataset)
df_c = dst_data_abidjan.copy()
# Recode WC location ("in"/"out")
public_sel = ["Public","Neighb_YardPlot","Neighb_Dwelling"]
df_c["WC_Loc_InOutPrem"] = np.nan
df_c["WC_Loc_InOutPrem"][df_c["ToiletFacility"].isin(public_sel)] = "out of premises"
df_c["WC_Loc_InOutPrem"][~df_c["ToiletFacility"].isin(public_sel+["NoFacility_Nature"])] = "within premises"

# general stats
print("Education level (head of household): \n",
      df_c["SecEduHH"].value_counts())
# define groups
lowmid_inc_t = df_c[df_c["SecEduHH"]==0]
midhigh_inc_t = df_c[df_c["SecEduHH"]==1]
lowmid_inc_t = lowmid_inc_t[~lowmid_inc_t["WC_Loc_InOutPrem"].isna()]
midhigh_inc_t = midhigh_inc_t[~midhigh_inc_t["WC_Loc_InOutPrem"].isna()]
# calculate % households using WC out of premises
lowmid_inc_t_prop = lowmid_inc_t[(lowmid_inc_t["WC_Loc_InOutPrem"]=="out of premises")].shape[0]/lowmid_inc_t.shape[0]
midhigh_inc_t_prop = midhigh_inc_t[(midhigh_inc_t["WC_Loc_InOutPrem"]=="out of premises")].shape[0]/midhigh_inc_t.shape[0]
# print results
P = lowmid_inc_t_prop
N = lowmid_inc_t.shape[0]
CI = 1.96*(np.sqrt((P*(1-P))/N))
print("% houses using WC OUT of premises, head of household w/ at most primary education:  \n",
      P*100)
print("N =",N)
print("CI = ±",CI*100,"%")
P = midhigh_inc_t_prop
N = midhigh_inc_t.shape[0]
CI = 1.96*(np.sqrt((P*(1-P))/N))
print("% houses using WC OUT of premises, head of household w/ at least secondary education:  \n",
      P*100)
print("N =",N)
print("CI = ±",CI*100,"%")

# -------------------- ODDS RATIO --------------------
print("------------------ ODDS RATIO ------------------")
table = np.array([[lowmid_inc_t[(lowmid_inc_t["WC_Loc_InOutPrem"]=="out of premises")].shape[0], lowmid_inc_t[lowmid_inc_t["WC_Loc_InOutPrem"]=="within premises"].shape[0]],
                  [midhigh_inc_t[(midhigh_inc_t["WC_Loc_InOutPrem"]=="out of premises")].shape[0], midhigh_inc_t[midhigh_inc_t["WC_Loc_InOutPrem"]=="within premises"].shape[0]]])
print(table)
oddsratio, pvalue = stats.fisher_exact(table)
print("OddsR: ", oddsratio, "p-Value (OR = 1):", pvalue)
oddsratio, pvalue = stats.fisher_exact(table,alternative="greater")
print("p-Value (OR > 1):", pvalue)
oddsratio, pvalue = stats.fisher_exact(table,alternative="less")
print("p-Value (OR < 1):", pvalue)

Education level (head of household): 
 1.0    797
0.0    742
Name: SecEduHH, dtype: int64
% houses using WC OUT of premises, head of household w/ at most primary education:  
 26.4945652173913
N = 736
CI = ± 3.1882714344135943 %
% houses using WC OUT of premises, head of household w/ at least secondary education:  
 14.195979899497488
N = 796
CI = ± 2.424577347735175 %
------------------ ODDS RATIO ------------------
[[195 541]
 [113 683]]
OddsR:  2.1786105703956946 p-Value (OR = 1): 2.379476876096816e-09
p-Value (OR > 1): 1.29096135151982e-09
p-Value (OR < 1): 0.9999999994192625
 
 ------------------ NAIROBI ------------------
Education level (head of household): 
 1.0    539
0.0    500
Name: SecEduHH, dtype: int64
% houses using WC OUT of premises, head of household w/ at most primary education:  
 37.044534412955464
N = 494
CI = ± 4.258644494889268 %
% houses using WC OUT of premises, head of household w/ at least secondary education:  
 20.074349442379184
N = 538
CI = ± 3.384766884

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_c["WC_Loc_InOutPrem"][df_c["ToiletFacility"].isin(public_sel)] = "out of premises"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_c["WC_Loc_InOutPrem"][~df_c["ToiletFacility"].isin(public_sel+["NoFacility_Nature"])] = "within premises"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_c["WC_Loc_InOutPrem"][df_c["ToiletFacility"].isin(public_sel)] = "out of premises"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the docum

### 4.2.2 Stratify by gender

In [16]:
## Extract gender of heads of households

# Subset relevant data
subset = data_IND_tot[['PARENT_KEY','KEY',
                       'Relation_to_HH','Age',
                       'Sex']].copy()
orig_data = data_IND_tot.copy()
# Subset 1: Extract heads of households
subset1 = subset[(subset.Relation_to_HH=='Head')&(subset.Age>=18)]
# Subset 2: Extract households with no appointed head
subset2 = subset[~subset.PARENT_KEY.isin(list(subset1.PARENT_KEY.unique()))]
# Remove duplicates from subset 1 (keep women, i.e., preferred choice for interview)
subset3 = subset1[(~subset1['PARENT_KEY'].duplicated(keep=False))] # subset3 : no duplicates
subset4 = subset1[(subset1['PARENT_KEY'].duplicated())&(subset1['Sex']=='F')] # subset4 : if duplicate, keep woman
subset5 = subset1[(~subset1.PARENT_KEY.isin(list(subset3.PARENT_KEY)+list(subset4.PARENT_KEY)))&
                  (subset1.Age>=18)]
subset5 = subset5[(subset5['PARENT_KEY'].duplicated(keep='first'))] # subset5 : if 2 men declared as head, get first adult
# Remove duplicates from subset 2 (households with no appointed head, keep women, i.e., preferred choice for interview)
subset6 = subset2[(~subset2['PARENT_KEY'].duplicated(keep=False))&(subset2.Age>=18)] # subset6 : no duplicates
subset7 = subset2[~subset2['PARENT_KEY'].isin(list(subset6['PARENT_KEY'].unique()))]
subset7 = subset7[(subset7.Age>=18)&(subset7['Sex']=='F')] # subset7 : if duplicate, keep woman
subset7 = subset7[(~subset7['PARENT_KEY'].duplicated(keep='first'))]
subset8 = subset2[(~subset2.PARENT_KEY.isin(list(subset6.PARENT_KEY)+list(subset7.PARENT_KEY)))] # subset8 : add remaining households and delete duplicates
subset8 = subset8[(~subset8['PARENT_KEY'].duplicated(keep='first'))]
# Locate unique heads of households (using 'KEY' unique identifier)
subset = subset[subset.KEY.isin(list(subset3.KEY.unique())+list(subset4.KEY.unique())+
                                list(subset5.KEY.unique())+list(subset6.KEY.unique())+
                                list(subset7.KEY.unique())+list(subset8.KEY.unique()))]

# Check subset shape
if (subset.shape[0] - len(subset.KEY.unique()))==0:
    print('Extracting gender of head of household...')
    print('Extraction OK')
    print('Total obs.:',subset.shape[0])
else:
    print('------- WARNING -------')
    print('(!) Subset NOT OK (!)')
    print('>> Check subset: total number of obs. must match number of unique households IDs (PARENT_KEY)')

# Recode Sex
subset['Sex_HH_F'] = np.nan
subset['Sex_HH_F'][subset['Sex']=='M'] = 0
subset['Sex_HH_F'][subset['Sex']=='F'] = 1
    
# Attribute gender of heads of households
subset = subset[['PARENT_KEY','Sex_HH_F']]
subset = subset.rename(columns={"PARENT_KEY": "KEY"})
dst_data = dst_data.merge(subset,on="KEY",how='left')
dst_data_nairobi = dst_data_nairobi.merge(subset,on="KEY",how='left')
dst_data_abidjan = dst_data_abidjan.merge(subset,on="KEY",how='left')
dst_data_U5 = dst_data_U5.merge(subset,on="KEY",how='left')
dst_data_U5_nairobi = dst_data_U5_nairobi.merge(subset,on="KEY",how='left')
dst_data_U5_abidjan = dst_data_U5_abidjan.merge(subset,on="KEY",how='left')

Extracting gender of head of household...
Extraction OK
Total obs.: 1714


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['Sex_HH_F'][subset['Sex']=='M'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['Sex_HH_F'][subset['Sex']=='F'] = 1


In [17]:
## Stratify subsets

# Group by confounding variable
c_var = 'Sex_HH_F'
fmhh_dst_data = dst_data[dst_data[c_var]==1] # Female head of household, aggregated dataset
mlhh_dst_data = dst_data[dst_data[c_var]==0] # Male head of household, aggregated dataset
fmhh_dst_data_nairobi = dst_data_nairobi[dst_data_nairobi[c_var]==1] # Female head of household, Nairobi subset
mlhh_dst_data_nairobi = dst_data_nairobi[dst_data_nairobi[c_var]==0] # Male head of household, Nairobi subset
fmhh_dst_data_abidjan = dst_data_abidjan[dst_data_abidjan[c_var]==1] # Female head of household, Abidjan subset
mlhh_dst_data_abidjan = dst_data_abidjan[dst_data_abidjan[c_var]==0] # Male head of household, Abidjan subset

# Subsets list
subsets_gen = [fmhh_dst_data,mlhh_dst_data,
               fmhh_dst_data_nairobi,mlhh_dst_data_nairobi,
               fmhh_dst_data_abidjan,mlhh_dst_data_abidjan
              ]
subsets_gen_str = ['fmhh_hhGen_total','mlhh_hhGen_total',
                   'fmhh_hhGen_nairobi','mlhh_hhGen_nairobi',
                   'fmhh_hhGen_abidjan','mlhh_hhGen_abidjan'
                  ]


## Risk of feeling unsafe to access WC

# Exposure variable
exposure_lst = ["WCpublic"] # whether toilet is OUT of premises

# Calculate odds ratios
outcome_var = "WCunsafe"
outcome_pos = 1
outcome_neg = 0
df_oddsr_WCunsafe = pd.DataFrame()
for idx, subset in enumerate(subsets_gen):
    print("------------------ ",subsets_gen_str[idx]," ------------------")
    for exposure in exposure_lst:
        # define groups
        print("Variable:",exposure)
        exposure_grp = subset[subset[exposure]==1]
        no_exposure_grp = subset[subset[exposure]==0]
        exposure_grp = exposure_grp[~exposure_grp[outcome_var].isna()]
        no_exposure_grp = no_exposure_grp[~no_exposure_grp[outcome_var].isna()]
        # set table for Fisher tests
        table = np.array([[exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0], exposure_grp[exposure_grp[outcome_var]==outcome_neg].shape[0]],
                          [no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0], no_exposure_grp[no_exposure_grp[outcome_var]==outcome_neg].shape[0]]])
        # calculate proportion of households with at least 1 case
        exposure_grp_prop = exposure_grp[exposure_grp[outcome_var]==outcome_pos].shape[0]/exposure_grp.shape[0]
        no_exposure_grp_prop = no_exposure_grp[no_exposure_grp[outcome_var]==outcome_pos].shape[0]/no_exposure_grp.shape[0]
        # calculate 95% CI - exposure group
        P_exp = exposure_grp_prop
        N_exp = exposure_grp.shape[0]
        CI_exp = 1.96*(np.sqrt((P_exp*(1-P_exp))/N_exp))
        # calculate 95% CI - no exposure group
        P_ne = no_exposure_grp_prop
        N_ne = no_exposure_grp.shape[0]
        CI_ne = 1.96*(np.sqrt((P_ne*(1-P_ne))/N_ne))
        # run Fisher tests for OR = 1
        oddsratio_eq1, pvalue_eq1 = stats.fisher_exact(table)
        # run Fisher tests for OR > 1
        oddsratio_greater1, pvalue_greater1 = stats.fisher_exact(table,alternative="greater")
        # run Fisher tests for OR < 1
        oddsratio_less1, pvalue_less1 = stats.fisher_exact(table,alternative="less")
        # add results to dataframe
        df_oddsr_WCunsafe = df_oddsr_WCunsafe.append([[exposure,
                                             subsets_gen_str[idx],
                                             exposure_grp_prop*100,
                                             '±'+str(round(CI_exp*100,2)),
                                             no_exposure_grp_prop*100,
                                             '±'+str(round(CI_ne*100,2)),
                                             oddsratio_eq1,
                                             pvalue_eq1,
                                             pvalue_greater1,
                                             pvalue_less1,
                                             table
                                            ]])

# Reset columns' names & index
df_oddsr_WCunsafe.columns = ['exp_variable','subset',
                             'prop_exp','CI_pp_exp','prop_no_exp','CI_pp_no_exp',
                             'OR','p_OR_eq_1','p_OR_hi_1','p_OR_lo_1',
                             'table'
                            ]
df_oddsr_WCunsafe = df_oddsr_WCunsafe.reset_index()

# Check results
df_oddsr_WCunsafe

------------------  fmhh_hhGen_total  ------------------
Variable: WCpublic
------------------  mlhh_hhGen_total  ------------------
Variable: WCpublic
------------------  fmhh_hhGen_nairobi  ------------------
Variable: WCpublic
------------------  mlhh_hhGen_nairobi  ------------------
Variable: WCpublic
------------------  fmhh_hhGen_abidjan  ------------------
Variable: WCpublic
------------------  mlhh_hhGen_abidjan  ------------------
Variable: WCpublic


Unnamed: 0,index,exp_variable,subset,prop_exp,CI_pp_exp,prop_no_exp,CI_pp_no_exp,OR,p_OR_eq_1,p_OR_hi_1,p_OR_lo_1,table
0,0,WCpublic,fmhh_hhGen_total,76.436782,±6.31,9.217877,±3.0,31.947524,6.584834999999999e-56,6.584834999999999e-56,1.0,"[[133, 41], [33, 325]]"
1,0,WCpublic,mlhh_hhGen_total,63.690476,±7.27,8.536585,±2.14,18.793911,8.915441e-49,8.915441e-49,1.0,"[[107, 61], [56, 600]]"
2,0,WCpublic,fmhh_hhGen_nairobi,76.023392,±6.4,3.214286,±2.07,95.474255,3.836812e-64,3.836812e-64,1.0,"[[130, 41], [9, 271]]"
3,0,WCpublic,mlhh_hhGen_nairobi,66.233766,±7.47,3.829787,±1.74,49.25641,3.030878e-58,3.030878e-58,1.0,"[[102, 52], [18, 452]]"
4,0,WCpublic,fmhh_hhGen_abidjan,100.0,±0.0,30.769231,±10.24,inf,0.0342827,0.0342827,1.0,"[[3, 0], [24, 54]]"
5,0,WCpublic,mlhh_hhGen_abidjan,35.714286,±25.1,20.430108,±5.79,2.163743,0.1860681,0.1564802,0.946687,"[[5, 9], [38, 148]]"


# Summary of association between WC location, safety and education

#### Even in analyses stratified by education and gender, the trend stayed consistend: WC location impacts on safety

- Education : associated with WC location (which may explain impact on safety)  
Globally, both sites showed the same trend: even in stratified analyses (controlling by education level of head of household), toilet location significantly increased the risk of lack of safety.  
Moreover, the analysis above shows that a higher education level was associated with a higher likelihood of NOT using a toilet outside of premises. This can explain why lower levels of education were associated with higer risk of lack of safety: education was INDIRECTLY related to safety (as the use of toilets out of premises reduces safety, and people with higher education level tend to use toilets INSIDE premises).  

- Gender : females tend to feel more unsafe
Gender affected perceived safety, with odds ratios for women being consistently higher than for men. But even accounting for these differences, the impact of WC location significantly affected safety for both groups (WC out of premises being positively correlated to lack of safety in both cases)