In [1]:
import os
from pathlib import Path
from os import path

import numpy as np
import pandas as pd

# 1. Input data

In [2]:
cwd = os.getcwd()
root_dir = Path(cwd).parent

In [3]:
## Input data

# Individual data
data_IND_tot = pd.read_csv(path.join(root_dir,
                                    "data/individual_data.csv")).drop(['Unnamed: 0'],axis=1)

# 2. Preprocess data

## 2.1. Recode diarrhea

In [4]:
## Define cases of diarrhoea

# Dummy dataframe
df = data_IND_tot.copy()

# Recode diarrhoea
case_mask = (df["Diarrhoea"]=="Y") # participant had diarrhoea in the 2 weeks preceeding the survey
df["Case"] = np.nan
df["Case"][~df["Diarrhoea"].isna()] = 0
df["Case"][case_mask] = 1

## 2.2 Recode relevant variables

In [5]:
## Recode sex

# Males
male_mask = (df["Sex"]=="M")
df["Males"] = np.nan
df["Males"][~df["Sex"].isna()] = 0
df["Males"][male_mask] = 1

# Females
female_mask = (df["Sex"]=="F")
df["Females"] = np.nan
df["Females"][~df["Sex"].isna()] = 0
df["Females"][female_mask] = 1

# Undefined
nd_mask = (df["Sex"].isna())
df["no data/Sex"] = 0
df["no data/Sex"][nd_mask] = 1

# 3. Descriptive stats

In [6]:
## Subsets

# General population
df_abi = df[df['City']=='Abidjan'].copy()
print("N for general pop. in Abidjan:",df_abi.shape[0])
df_nai = df[df['City']=='Nairobi'].copy()
print("N for general pop. in Nairobi:",df_nai.shape[0])

# Children under five
mask = (df["Age_group"]=='under five')
df_U5 = df[mask]
df_U5_abi = df_U5[df_U5['City']=='Abidjan'].copy()
print("N for pop. under-five in Abidjan:",df_U5_abi.shape[0])
df_U5_nai = df_U5[df_U5['City']=='Nairobi'].copy()
print("N for pop. under-five in Nairobi:",df_U5_nai.shape[0])

N for general pop. in Abidjan: 2498
N for general pop. in Nairobi: 3786
N for pop. under-five in Abidjan: 283
N for pop. under-five in Nairobi: 491


## 3.1 Surveyed population, by site, age group and sex

In [7]:
## Create columns for each age group

# Under-fives
yu5_mask = (df["Age_group"]=="under five")
df["under five"] = np.nan
df["under five"][~df["Age_group"].isna()] = 0
df["under five"][yu5_mask] = 1

# 5-17
y5_17_mask = (df["Age_group"]=="5 to 17")
df["5 to 17"] = np.nan
df["5 to 17"][~df["Age_group"].isna()] = 0
df["5 to 17"][y5_17_mask] = 1

# Adults (18+)
y18_mask = (df["Age_group"]=="adult (18 or older)")
df["adult (18 or older)"] = np.nan
df["adult (18 or older)"][~df["Age_group"].isna()] = 0
df["adult (18 or older)"][y18_mask] = 1

# Error / undefined
nd_mask = (df["Age_group"]=="unknown/did not answer")
df["no data/Age"] = 0
df["no data/Age"][nd_mask] = 1

## List variables (columns) of interest
var_list = ['under five','5 to 17','adult (18 or older)','no data/Age',# age groups
            'Males','Females','no data/Sex'# sex groups
           ]

## Descriptive statistics

# Counts, sum, and percentage
df_cnt = df[['Site']+var_list].dropna().groupby(['Site']).count().astype(int).reset_index()
df_sum = df[['Site']+var_list].dropna().groupby(['Site']).sum().astype(int).reset_index()
df_pc = df[['Site']+var_list].dropna().groupby(['Site']).mean()*100
df_pc = df_pc.reset_index()

# Build dataframe
df_table = df_cnt.merge(df_sum,on='Site')
df_table = df_table.merge(df_pc,on='Site')

# Rename columns
cnt_lst = []
tot_outcomes_lst = []
pc_lst = []
for var_name in var_list:
    col_name1 = 'series count '+var_name
    col_name2 = 'total count '+var_name
    col_name3 = 'percentage '+var_name
    cnt_lst = cnt_lst+[col_name1]
    tot_outcomes_lst = tot_outcomes_lst+[col_name2]
    pc_lst = pc_lst+[col_name3]
columns_list = cnt_lst+tot_outcomes_lst+pc_lst
df_table.columns = ['site']+columns_list

# Transpose table
df_table_t = df_table.T
df_table_t.columns = df_table_t.iloc[0]#set column names equal to values in row index position 0
df_table_t = df_table_t[1:]#remove first row from DataFrame

# View
df_table_t

site,Azito,Mabatini,Vietnam,Williamsville
series count under five,1191.0,1935.0,1851.0,1307.0
series count 5 to 17,1191.0,1935.0,1851.0,1307.0
series count adult (18 or older),1191.0,1935.0,1851.0,1307.0
series count no data/Age,1191.0,1935.0,1851.0,1307.0
series count Males,1191.0,1935.0,1851.0,1307.0
series count Females,1191.0,1935.0,1851.0,1307.0
series count no data/Sex,1191.0,1935.0,1851.0,1307.0
total count under five,124.0,226.0,265.0,159.0
total count 5 to 17,303.0,561.0,480.0,363.0
total count adult (18 or older),630.0,1143.0,1100.0,681.0


## 3.2. Prevalence of diarrhea, by site and age group

In [8]:
## Diarrhea prevalence in the general population, by site 

# List variables
var_list = ['Case']

# Count stats
df_cnt = df[['Site']+var_list].dropna().groupby(['Site']).count().astype(int).reset_index()
df_sum = df[['Site']+var_list].dropna().groupby(['Site']).sum().astype(int).reset_index()
df_pc = df[['Site']+var_list].dropna().groupby(['Site']).mean()*100
df_pc = df_pc.reset_index()

# Build dataframe
df_table = df_cnt.merge(df_sum,on='Site')
df_table = df_table.merge(df_pc,on='Site')

# Rename columns
df_table.columns = ['site','series count','total count','percentage']

# 95% CIs
ci_list = []
for P,N in zip(df_table['percentage'].to_list(),df_table['total count'].to_list()):
    ci = round((1.96*(np.sqrt((P*(100-P))/N))),2)
    ci_list = ci_list+['±'+str(ci)+'%']
df_table['95% CI'] = ci_list

# View
df_table

Unnamed: 0,site,series count,total count,percentage,95% CI
0,Azito,1165,174,14.935622,±5.3%
1,Mabatini,1922,259,13.475546,±4.16%
2,Vietnam,1844,193,10.466377,±4.32%
3,Williamsville,1273,179,14.061273,±5.09%


In [9]:
## Diarrhea prevalence in the general population, by city 

# List variables
var_list = ['Case']

# Count stats
df_cnt = df[['City']+var_list].dropna().groupby(['City']).count().astype(int).reset_index()
df_sum = df[['City']+var_list].dropna().groupby(['City']).sum().astype(int).reset_index()
df_pc = df[['City']+var_list].dropna().groupby(['City']).mean()*100
df_pc = df_pc.reset_index()

# Build dataframe
df_table = df_cnt.merge(df_sum,on='City')
df_table = df_table.merge(df_pc,on='City')

# Rename columns
df_table.columns = ['site','series count','total count','percentage']

# 95% CIs
ci_list = []
for P,N in zip(df_table['percentage'].to_list(),df_table['total count'].to_list()):
    ci = round((1.96*(np.sqrt((P*(100-P))/N))),2)
    ci_list = ci_list+['±'+str(ci)+'%']
df_table['95% CI'] = ci_list

# View
df_table

Unnamed: 0,site,series count,total count,percentage,95% CI
0,Abidjan,2438,353,14.479081,±3.67%
1,Nairobi,3766,452,12.002124,±3.0%


In [10]:
## Diarrhea prevalence in children under five years old, by site 

# List variables
var_list = ['Case']

df = df_U5.copy()

# Count stats
df_cnt = df[['Site']+var_list].dropna().groupby(['Site']).count().astype(int).reset_index()
df_sum = df[['Site']+var_list].dropna().groupby(['Site']).sum().astype(int).reset_index()
df_pc = df[['Site']+var_list].dropna().groupby(['Site']).mean()*100
df_pc = df_pc.reset_index()

# Build dataframe
df_table = df_cnt.merge(df_sum,on='Site')
df_table = df_table.merge(df_pc,on='Site')

# Rename columns
df_table.columns = ['site','series count','total count','percentage']

# 95% CIs
ci_list = []
for P,N in zip(df_table['percentage'].to_list(),df_table['total count'].to_list()):
    ci = round((1.96*(np.sqrt((P*(100-P))/N))),2)
    ci_list = ci_list+['±'+str(ci)+'%']
df_table['95% CI'] = ci_list

# View
df_table

Unnamed: 0,site,series count,total count,percentage,95% CI
0,Azito,124,34,27.419355,±15.0%
1,Mabatini,226,60,26.548673,±11.17%
2,Vietnam,265,67,25.283019,±10.41%
3,Williamsville,157,35,22.292994,±13.79%


In [11]:
## Diarrhea prevalence in children under five years old, by city 

# List variables
var_list = ['Case']

df = df_U5.copy()

# Count stats
df_cnt = df[['City']+var_list].dropna().groupby(['City']).count().astype(int).reset_index()
df_sum = df[['City']+var_list].dropna().groupby(['City']).sum().astype(int).reset_index()
df_pc = df[['City']+var_list].dropna().groupby(['City']).mean()*100
df_pc = df_pc.reset_index()

# Build dataframe
df_table = df_cnt.merge(df_sum,on='City')
df_table = df_table.merge(df_pc,on='City')

# Rename columns
df_table.columns = ['site','series count','total count','percentage']

# 95% CIs
ci_list = []
for P,N in zip(df_table['percentage'].to_list(),df_table['total count'].to_list()):
    ci = round((1.96*(np.sqrt((P*(100-P))/N))),2)
    ci_list = ci_list+['±'+str(ci)+'%']
df_table['95% CI'] = ci_list

# View
df_table

Unnamed: 0,site,series count,total count,percentage,95% CI
0,Abidjan,281,69,24.55516,±10.16%
1,Nairobi,491,127,25.86558,±7.62%
