In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
# Seaborn color palette (called with no arguments). Not referenced in the cells
# shown here -- presumably kept for later plotting cells (TODO confirm).
pal = sns.color_palette()

In [2]:
#Load the raw survey records from the Stata (.dta) export
raw_data_path = '../Data/ZDinputdata_Jan23.dta'
data_dta = pd.io.stata.read_stata(raw_data_path)
#show which variables are available
print(data_dta.columns)

Index(['countrywave', 'caseid', 'midx', 'dhsid', 'v000', 'v001', 'v002',
       'v003', 'v007', 'v024', 'ch_allvac_either', 'ch_allvac_moth',
       'ch_allvac_card', 'ch_novac_either', 'ch_novac_moth', 'ch_novac_card',
       'age', 'male', 'birth_order', 'mum_educlow', 'mum_educhigher',
       'npregnancies', 'firstpreg', 'hh_5plus', 'hh_urban', 'hh_wealth',
       'anc_any', 'anc_number', 'facdelivery', 'regionname_original',
       'IA2015district', 'ia2015fic', 'IA2020district',
       'v024_states_asin_IA202020', 'geo_ia2015', 'dhs_ipumsi_ia',
       'dhs_ipumsi_ml', 'dhs_ipumsi_ng', 'regionname', 'gps_dataset', 'dhscc',
       'dhsyear', 'dhsclust', 'surveyid', 'all_population_count_2015',
       'nightlights_composite', 'travel_times_2015', 'u5_population_2015',
       'un_population_count_2015', 'un_population_density_2015', 'urban_rura',
       'latnum', 'longnum', 'datum', 'clusterid'],
      dtype='object')


In [3]:
#Preprocessing!
#https://dhsprogram.com/pubs/pdf/DHSG4/Recode7_DHS_10Sep2018_DHSG4.pdf (Description)


#create label any_vac: 1 means at least one vaccine obtained and 0 means no vaccine
#(the 0/1 values of ch_novac_either are swapped to obtain it)
data = data_dta.rename(columns={"ch_novac_either": "any_vac"})
data["any_vac"] = data["any_vac"].replace({0:1, 1:0})
data['all_vac'] = data['ch_allvac_either']

#convert anc_number to categories: [0,1) -> 'no', [1,4) -> 'low', [4,100) -> 'high'
data['anc_number'] = data["anc_number"].replace("no antenatal visits", 0)
bins= [0, 1, 4, 100]
labels = ['no','low','high']
data['anc_cat'] = pd.cut(data['anc_number'], bins=bins, labels=labels, right=False)

#convert age to categories
#NOTE(review): with right=False the last bin is [21, 24), so an age of exactly 24
#gets no category despite the '21-24' label -- confirm that age never reaches 24
bins= [12,15,18,21,24]
labels = ['12-14','15-17','18-20','21-24']
data['age_group'] = pd.cut(data['age'], bins=bins, labels=labels, right=False)


#replace na with -1 for population density so that missing values land in the 'NA' bin [-1, 0)
#The filled values are kept in a local Series. Storing them in
#data['un_population_count_2015'] would overwrite the population counts with densities.
population_density = data['un_population_density_2015'].fillna(-1)
#categorize population density into (<300, 300-1500, 1500+)
#The top bin is open-ended: with max(density) as the last edge and right=False the
#densest cluster itself would fall outside [1500, max) and get no category, and the
#edges would stop being increasing if the maximum were below 1500.
bins= [-1, 0, 300, 1500, np.inf]
labels = ['NA','Village','Town','City']
data['un_population_cat'] = pd.cut(population_density, bins=bins, labels=labels, right=False)

#create a district code variable for IA 2015 and IA 2020
#(IA2015district is used where present, IA2020district otherwise)
data['sdist'] = data['IA2015district'].fillna(data['IA2020district'])

sparse_predictors = ["age_group", "male", "anc_cat", "facdelivery", "hh_urban", "v024"] #administrative data
contextual_predictors = ["nightlights_composite", "un_population_cat", "travel_times_2015"] #cluster-level contextual data
additional_predictors = ["birth_order", "mum_educlow", "mum_educhigher",
                        "npregnancies", "firstpreg", "hh_5plus", "hh_wealth"]# survey data
                        #hh_5plus = more than 5 family members

#columns carried along next to the predictors
extra_features = ['sdist', 'regionname', 'clusterid', 'ia2015fic', 'all_vac']

#country waves for which the contextual predictors are processed in the per-country split
contextual_datasets = ['IA2020', 'IA2015','ML2018','NG2018']
print(data.columns)
print(data[data['countrywave']=='IA2015'].shape[0])

Index(['countrywave', 'caseid', 'midx', 'dhsid', 'v000', 'v001', 'v002',
       'v003', 'v007', 'v024', 'ch_allvac_either', 'ch_allvac_moth',
       'ch_allvac_card', 'any_vac', 'ch_novac_moth', 'ch_novac_card', 'age',
       'male', 'birth_order', 'mum_educlow', 'mum_educhigher', 'npregnancies',
       'firstpreg', 'hh_5plus', 'hh_urban', 'hh_wealth', 'anc_any',
       'anc_number', 'facdelivery', 'regionname_original', 'IA2015district',
       'ia2015fic', 'IA2020district', 'v024_states_asin_IA202020',
       'geo_ia2015', 'dhs_ipumsi_ia', 'dhs_ipumsi_ml', 'dhs_ipumsi_ng',
       'regionname', 'gps_dataset', 'dhscc', 'dhsyear', 'dhsclust', 'surveyid',
       'all_population_count_2015', 'nightlights_composite',
       'travel_times_2015', 'u5_population_2015', 'un_population_count_2015',
       'un_population_density_2015', 'urban_rura', 'latnum', 'longnum',
       'datum', 'clusterid', 'all_vac', 'anc_cat', 'age_group',
       'un_population_cat', 'sdist'],
      dtype='object')
462

In [4]:
#Keep only the label, the country wave and the predictor / bookkeeping columns
reduced_columns = (
    ["any_vac", 'countrywave']
    + sparse_predictors
    + additional_predictors
    + contextual_predictors
    + extra_features
)
data_reduced = data[reduced_columns]
print(data_reduced.columns)
print(data_reduced.shape)
#number of IA2015 rows
is_ia2015 = data_reduced['countrywave']=='IA2015'
print(len(data_reduced[is_ia2015]))

Index(['any_vac', 'countrywave', 'age_group', 'male', 'anc_cat', 'facdelivery',
       'hh_urban', 'v024', 'birth_order', 'mum_educlow', 'mum_educhigher',
       'npregnancies', 'firstpreg', 'hh_5plus', 'hh_wealth',
       'nightlights_composite', 'un_population_cat', 'travel_times_2015',
       'sdist', 'regionname', 'clusterid', 'ia2015fic', 'all_vac'],
      dtype='object')
(107024, 23)
46209


In [5]:
#split data into multiple countries; also categorize nighttime light into 11 bins
import os
import statistics as s

#make sure the output folder exists so the cell also works on a fresh checkout
os.makedirs('../data/clean_data/', exist_ok=True)

#names of the 11 nightlight bins; they are written to the CSV files, so the
#spellings are kept exactly as they are
labels = ['zero','first','second','third', 'fourth', 'fifth', 'sixth', 'seventh', 'eight', 'nineth', 'tenth']

countries = ["ML2018", "NG2018", "ML2006", "IA2006",  "NG2008", "IA2015", "IA2020"]
for country in countries:
    #.copy() gives an independent frame, so the column assignments below never
    #write into a slice of data_reduced (avoids SettingWithCopyWarning)
    df = data_reduced[data_reduced['countrywave']==country].copy()
    print(country)
    if(country in contextual_datasets):
        df = df[df['nightlights_composite'].notnull()].copy() # Removed all the rows with no nightlight information
        #bin edges: [0, smallest positive value) is 'zero'; the positive values are
        #then split at their deciles, with max+1 as the last edge so the maximum is included
        x = df['nightlights_composite']
        x = x[x>0]
        min_x = min(x)
        max_x = max(x)+1
        deciles = s.quantiles(x, n=10)
        deciles = [0, min_x]+deciles + [max_x]
        df['nightlights_composite'] = (pd.cut(df['nightlights_composite'], bins=deciles, labels=labels, right=False))
        #sdist is kept for the two IA waves only
        if(country not in ['IA2020', 'IA2015']):
             df = df.drop(['sdist'], axis=1)
        #IMI_target is 1 where ia2015fic is below 0.8 (a missing ia2015fic compares as False, i.e. 0)
        if(country in ['IA2015']):
            df['IMI_target'] = (df['ia2015fic']<0.8).astype(int)
        else:
            df = df.drop(['ia2015fic'], axis=1)
    else:
        #the remaining waves are saved without the contextual predictors
        df = df.drop(['sdist', 'ia2015fic']+contextual_predictors, axis=1)

    df = df.drop('countrywave', axis=1)
    df.to_csv('../data/clean_data/'+country+'.csv', index=False)
print('clean CSV files created')

ML2018
NG2018
ML2006
IA2006
NG2008
IA2015
IA2020
clean CSV files created


In [6]:
#Utility functions
#columns that read_file converts to the pandas 'category' dtype when they are present
categorical_predictors = ['age_group', 'anc_cat', 'un_population_cat', 'nightlights_composite', 'v024', 'clusterid', 'hh_wealth']

#Read file from CSV and return a dataframe
def read_file(country, verbose = False, data_dir = '../data/clean_data/'):
    """Load the cleaned CSV of one country wave.

    Parameters
    ----------
    country : str
        Name of the country wave; the file read is ``<data_dir>/<country>.csv``.
    verbose : bool, default False
        If True, print a confirmation message and the column dtypes.
    data_dir : str, default '../data/clean_data/'
        Folder that holds the cleaned CSV files.

    Returns
    -------
    pandas.DataFrame
        The file content, with every column listed in ``categorical_predictors``
        that exists in the file converted to the 'category' dtype.
    """
    import os

    df = pd.read_csv(os.path.join(data_dir, country + ".csv"))
    for feature in categorical_predictors:
        #not every wave has every predictor, so only convert what the file contains
        if feature in df.columns:
            df[feature] = df[feature].astype('category')
    if verbose:
        print('Read complete: Clean data for '+country)
        print(df.dtypes)
    return df

In [7]:
#Report, for every country wave, how many rows are lost to missing values
countries = ["IA2020", "IA2015", "IA2006", "ML2018", "ML2006", "NG2018", "NG2008"]
print("Before removing missing values, After removing mising values, difference, fraction of ZD, nv024, nregionnames")
for country in countries:
    d = read_file(country)
    d2 = d.dropna()  #cleaned rows without any missing value
    d_old = data_reduced[data_reduced['countrywave']==country]  #rows before cleaning
    n_before, n_after = d_old.shape[0], d2.shape[0]
    nv024 = len(d2['v024'].unique())
    nregions = len(d2['regionname'].unique())
    zd_fraction = 1-np.mean(d2['any_vac'])  #share of rows with any_vac == 0
    fields = [str(n_before), str(n_after), str(n_before-n_after), str(zd_fraction), str(nv024), str(nregions)]
    print("Country:"+country+", "+", ".join(fields))
    #Add updated numbers to the draft
    

Before removing missing values, After removing mising values, difference, fraction of ZD, nv024, nregionnames
Country:IA2020, 40555, 40290, 265, 0.037081161578555455, 30, 30
Country:IA2015, 46209, 45977, 232, 0.06672901668225417, 30, 30
Country:IA2006, 8978, 8978, 0, 0.05981287591891293, 26, 26
Country:ML2018, 1803, 1743, 60, 0.1583476764199656, 8, 8
Country:ML2006, 2440, 2440, 0, 0.12786885245901636, 8, 8
Country:NG2018, 2350, 2292, 58, 0.16099476439790572, 37, 37
Country:NG2008, 4689, 4689, 0, 0.2945190872254212, 37, 37


In [9]:
#Print, for every v024 code in IA2015, the region name(s) recorded with it
d = read_file('IA2015').dropna()
unique_IDs = d['v024'].unique()
for state_id in unique_IDs:
    region_names = d.loc[d['v024']==state_id, 'regionname'].unique()
    #str + array concatenates element-wise, so the array of labelled names is printed
    print(str(state_id)+"="+region_names)

['1041=andaman and nicobar islands']
['1028=andhra pradesh and telangana']
['1012=arunachal pradesh']
['1018=assam']
['1010=bihar and jharkhand']
['1042=chandigarh']
['1023=madhya pradesh and chhattisgarh']
['1070=goa; dadra and nagar haveli; daman and diu']
['1024=gujarat']
['1006=haryana']
['1002=himachal pradesh']
['1001=jammu and kashmir']
['1029=karnataka']
['1032=kerala']
['1044=lakshadweep']
['1027=maharashtra']
['1014=manipur']
['1017=meghalaya']
['1015=mizoram']
['1013=nagaland']
['1007=delhi']
['1021=orissa']
['1045=pondicherry']
['1003=punjab']
['1008=rajasthan']
['1011=sikkim']
['1033=tamil nadu']
['1016=tripura']
['1009=uttar pradesh and uttaranchal']
['1019=west bengal']
