In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
pal = sns.color_palette()

In [2]:
sparse_predictors = ["age_group", "male", "anc_cat", "facdelivery", "hh_urban", "v024"] #administrative data
contextual_predictors = ["nightlights_composite", "un_population_cat", "travel_times_2015"] #custer level data from censor
additional_predictors = ["birth_order", "mum_educlow", "mum_educhigher",
                        "npregnancies", "firstpreg", "hh_5plus", "hh_wealth"]# survey data
                       

extra_features = ['sdist', 'regionname', 'clusterid', 'all_vac', 'ia2015fic', 'IMI_target']

contextual_datasets = ['IA2020', 'IA2015','ML2018','NG2018']

In [4]:
#Utility functions
categorical_predictors = ['age_group', 'anc_cat', 'un_population_cat', 'nightlights_composite', 'v024', 'clusterid', 'hh_wealth']
extra_features = ['sdist', 'regionname', 'clusterid', 'all_vac', 'ia2015fic', 'IMI_target']

#Read file from CSV and return a dataframe
def read_file(country, verbose = False):
    df = (pd.read_csv('../data/clean_data/' + country + ".csv"))
    for feature in categorical_predictors:
        if feature in df.columns:
            df[feature] = df[feature].astype('category')
    if verbose:
        print('Read complete: Clean data for '+country)
        print(df.dtypes)
    return df

def get_clean_dataset(country, verbose = False):
    df = read_file(country, verbose)        
    df = df.dropna()
    if country =='IA2020':
        df = df.drop(['sdist', 'regionname', 'clusterid', 'all_vac'], axis=1)
    elif country == 'IA2015':        
        df = df.drop(extra_features, axis=1)
    else:
        df = df.drop(['regionname', 'clusterid', 'all_vac'], axis=1)
    if(verbose):
        print("Missing values and extra columns removed.")
    return df


In [7]:
#Summary statistics
countries = ['IA2020', 'IA2015', 'IA2006', 'ML2018', 'ML2006', 'NG2018', 'NG2008']
for country in countries:
    print(country)
    # Percentage zero-dose children
    output = pd.DataFrame()

    write_path = "../results/summary/summary_"+country+".txt"
    f= open(write_path,"w+")

    df = get_clean_dataset(country)
    npoints = df.shape[0]
    print("Number of data points: "+str(npoints))
    frac = sum(df['any_vac']==0)/npoints
    print("Number of ZD: "+str(frac))
    nregions = (df['v024'].unique()).shape[0]
    print("Number of regions: "+str(nregions)+'\n')
    df = df.drop('v024', axis=1)
    df['zd'] = 1-df['any_vac']

    #stats for continuous variables
    describe =  df.describe()
    n = df.shape[0]
    output['cols'] = describe.columns
    output['mean_cols'] = describe.values[1]
    output['std_cols'] = describe.values[2]
    with open(write_path, 'a') as f:
        dfAsString = output.to_string(header=True, index=False)
        f.write(dfAsString)
    f = open(write_path, 'a')
    f.write('\n')

    #stats for categorical variables
    for feature in categorical_predictors:
        if feature in df.columns:
            freq_df = pd.crosstab(index = df[feature], columns = 'freq')
            freq_df['freq'] = freq_df['freq']/n
            with open(write_path, 'a') as f:
                dfAsString = freq_df.to_string(header=True, index=True)
                f.write(dfAsString)    

IA2020
Number of data points: 40290
Number of ZD: 0.037081161578555476
Number of regions: 30

IA2015
Number of data points: 45977
Number of ZD: 0.06672901668225417
Number of regions: 30

IA2006
Number of data points: 8978
Number of ZD: 0.0598128759189129
Number of regions: 26

ML2018
Number of data points: 1743
Number of ZD: 0.15834767641996558
Number of regions: 8

ML2006
Number of data points: 2440
Number of ZD: 0.12786885245901639
Number of regions: 8

NG2018
Number of data points: 2292
Number of ZD: 0.16099476439790575
Number of regions: 37

NG2008
Number of data points: 4689
Number of ZD: 0.2945190872254212
Number of regions: 37



Run the code below, only if you need the summary results in tabular form! 

In [23]:
#Tabular form of Summary [No need to run this]
#Summary statistics (for creating tables for numeric features only)
countries_recent = ['IA2020', 'IA2015', 'ML2018', 'NG2018']
output = pd.DataFrame()
for country in countries_recent:

    df = get_clean_dataset(country)
    df = df.drop('v024', axis=1)
    df['zd'] = 1-df['any_vac']
    
    #stats for continuous variables
    describe =  df.describe()
    n = df.shape[0]
    output['cols'] = describe.columns
    output[country+'_mean_cols'] = describe.values[1]
    output[country+'_std_cols'] = describe.values[2]
output.to_csv("../results/summary/summary_recent_numeric.csv")

#Summary statistics
countries_old = ['IA2006', 'ML2006', 'NG2008']
output = pd.DataFrame()
for country in countries_old:

    df = get_clean_dataset(country)
    df = df.drop('v024', axis=1)
    df['zd'] = 1-df['any_vac']
    
    #stats for continuous variables
    describe =  df.describe()
    n = df.shape[0]
    output['cols'] = describe.columns
    output[country+'_mean_cols'] = describe.values[1]
    output[country+'_std_cols'] = describe.values[2]
output.to_csv("../results/summary/summary_old_numeric.csv")


In [24]:
#Tabular form of Summary [No need to run this]
#Summary statistics (for creating tables for categorical features only)
countries_recent = ['IA2020', 'IA2015', 'ML2018', 'NG2018']
for feature in categorical_predictors:
    output = pd.DataFrame()
    for country in countries_recent:
    # Percentage zero-dose children

        df = get_clean_dataset(country)
        df = df.drop('v024', axis=1)
        n = df.shape[0]
        if feature in df.columns:
            #print(feature)
            freq_df = pd.crosstab(index = df[feature], columns = 'freq')
            #output['categories'] = freq_df[0]
            if 'freq' not in output.columns:
                output = freq_df
            output[country+'freq'] = freq_df['freq']/n
    output.to_csv('../results/summary/summary_recent_'+feature+'.csv')

countries_old = ['IA2006', 'ML2006', 'NG2008']
for feature in categorical_predictors:
    output = pd.DataFrame()
    for country in countries_old:
    # Percentage zero-dose children

        df = get_clean_dataset(country)
        df = df.drop('v024', axis=1)
        n = df.shape[0]
        if feature in df.columns:
            freq_df = pd.crosstab(index = df[feature], columns = 'freq')
            #output['categories'] = freq_df[0]
            if 'freq' not in output.columns:
                output = freq_df
            output[country+'freq'] = freq_df['freq']/n
    output.to_csv('../results/summary/summary_old_'+feature+'.csv')

In [9]:
#Correlation matrix [No need to run this]
#Summary statistics with graphs for each country
import sweetviz as sv
import IPython
for country in countries:
    df = get_clean_dataset(country)
    report = sv.analyze(df)
    report.show_html('../results/summary/summary_'+country+'.html', open_browser=False)

    IPython.display.HTML('../results/summary/summary_'+country+'.html')

  all_source_names = [cur_name for cur_name, cur_series in source_df.iteritems()]
  filtered_series_names_in_source = [cur_name for cur_name, cur_series in source_df.iteritems()


                                             |          | [  0%]   00:00 -> (? left)

  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()


Report ../results/summary/summary_IA2020.html was generated.


  all_source_names = [cur_name for cur_name, cur_series in source_df.iteritems()]
  filtered_series_names_in_source = [cur_name for cur_name, cur_series in source_df.iteritems()


                                             |          | [  0%]   00:00 -> (? left)

  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()


Report ../results/summary/summary_IA2015.html was generated.


  all_source_names = [cur_name for cur_name, cur_series in source_df.iteritems()]
  filtered_series_names_in_source = [cur_name for cur_name, cur_series in source_df.iteritems()


                                             |          | [  0%]   00:00 -> (? left)

  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():


Report ../results/summary/summary_IA2006.html was generated.


  all_source_names = [cur_name for cur_name, cur_series in source_df.iteritems()]
  filtered_series_names_in_source = [cur_name for cur_name, cur_series in source_df.iteritems()


                                             |          | [  0%]   00:00 -> (? left)

  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()


Report ../results/summary/summary_ML2018.html was generated.


  all_source_names = [cur_name for cur_name, cur_series in source_df.iteritems()]
  filtered_series_names_in_source = [cur_name for cur_name, cur_series in source_df.iteritems()


                                             |          | [  0%]   00:00 -> (? left)

  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():


Report ../results/summary/summary_ML2006.html was generated.


  all_source_names = [cur_name for cur_name, cur_series in source_df.iteritems()]
  filtered_series_names_in_source = [cur_name for cur_name, cur_series in source_df.iteritems()


                                             |          | [  0%]   00:00 -> (? left)

  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()


Report ../results/summary/summary_NG2018.html was generated.


  all_source_names = [cur_name for cur_name, cur_series in source_df.iteritems()]
  filtered_series_names_in_source = [cur_name for cur_name, cur_series in source_df.iteritems()


                                             |          | [  0%]   00:00 -> (? left)

  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():
  for item in category_counts.iteritems():


Report ../results/summary/summary_NG2008.html was generated.
