# Niger DHS 2012

#### WEIGHTING
###### Design weights and sampling (survey) weights were computed for every household and ever-married women and never-married women selected to participate in the DHS 2012. A design weight is the inverse of probability of selecting a housing unit to be interviewed. The sampling weight of a household is the design weight corrected for non-response including other adjustments where necessary.

###### Define Functions

In [2]:
#import required packages
import pyreadstat
import warnings
warnings.filterwarnings("ignore")
from googletrans import Translator
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
# Function to calculate national vaccine coverage between specific age group
def calculate_vaccination_coverage(df, min_age, max_age, vaccine_column):
    # Filter rows where child_age is between min_age and max_age
    filtered_df = df[(df['child_age'] >= min_age) & (df['child_age'] <= max_age)]

    # Calculate the percentage of children vaccinated with the specified vaccine
    #total_children = filtered_df[vaccine_column].count()  # Count non-null values
    filtered_df_len=len(filtered_df)
    percentage_vaccinated = (filtered_df[vaccine_column].sum() / filtered_df_len) * 100
    #percentage_vaccinated = (filtered_df[vaccine_column].sum() /  total_children) * 100
    print("{} Vaccination Coverage for children between {} to {} months for Niger: {:.2f}%".format(vaccine_column, min_age, max_age, percentage_vaccinated))
    return percentage_vaccinated


# Example usage:
# calculate_vaccination_coverage(dfKR, 12, 23, 'bcg')


In [4]:
# Function to calculate vaccination coverage for each child_gender between specific age group
def calculate_vaccination_coverage_child_gender(df, min_age, max_age, child_gender, vaccine_column):
    # Filter rows where child_age is between min_age and max_age and child_gender matches
    filtered_df = df[(df['child_age'] >= min_age) & 
                     (df['child_age'] <= max_age) &
                     (df['child_gender'] == child_gender)]

    # Calculate the percentage of children vaccinated with the specified vaccine
    total_children = filtered_df[vaccine_column].count()  # Count non-null values in the vaccine_column
    if total_children == 0:
        print("No children found for the specified age range and child_gender.")
        return 0
    percentage_vaccinated = (filtered_df[vaccine_column].sum() / total_children) * 100
    #print("{} Vaccination Coverage for {} children between {} to {} months using KR dataset: {:.2f}%".format(vaccine_column, child_gender, min_age, max_age, percentage_vaccinated))
    return percentage_vaccinated


In [5]:
# Function to calculate vaccination coverage for each region between specific age group
def calculate_vaccination_coverage_region(df, min_age, max_age, region, vaccine_column):
    # Filter rows where child_age is between min_age and max_age and region matches
    filtered_df = df[(df['child_age'] >= min_age) & 
                     (df['child_age'] <= max_age) &
                     (df['region'] == region)]

    # Calculate the percentage of children vaccinated with the specified vaccine
    total_children = filtered_df[vaccine_column].count()  # Count non-null values in the vaccine_column
    if total_children == 0:
        print("No children found for the specified age range and region: {}.".format(region))
        return 0
    percentage_vaccinated = (filtered_df[vaccine_column].sum() / total_children) * 100
    #print("{} Vaccination Coverage for children in {} between {} to {} months using dataset: {:.2f}%".format(vaccine_column, region, min_age, max_age, percentage_vaccinated))
    return percentage_vaccinated


In [6]:
# Function to calculate vaccination coverage for each rsidence i.e urban/rural between specific age group
def calculate_vaccination_coverage_residence(df, min_age, max_age, residence, vaccine_column):
    # Filter rows where child_age is between min_age and max_age and region matches
    filtered_df = df[(df['child_age'] >= min_age) & 
                     (df['child_age'] <= max_age) &
                     (df['residence'] == residence)]

    # Calculate the percentage of children vaccinated with the specified vaccine
    total_children = filtered_df[vaccine_column].count()  # Count non-null values in the vaccine_column
    if total_children == 0:
        print("No children found for the specified age range and region: {}.".format(residence))
        return 0
    percentage_vaccinated = (filtered_df[vaccine_column].sum() / total_children) * 100
    #print("{} Vaccination Coverage for children in {} between {} to {} months using dataset: {:.2f}%".format(vaccine_column, residence, min_age, max_age, percentage_vaccinated))
    return percentage_vaccinated

In [7]:
def calculate_retention_rate(df, group_col, dose1_col, dose2_col):
    """
    Calculate retention rate from dose2 to dose1 for each group.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame containing vaccination data.
    group_col (str): Column name for the grouping variable.
    dose1_col (str): Column name representing the first dose.
    dose2_col (str): Column name representing the second dose.
    
    Returns:
    pandas.DataFrame: DataFrame with retention rates calculated and added as a new column.
    """
    # Group by the specified column and aggregate sum of dose1 and dose2
    df1 = df.groupby([group_col]).agg(
        sum_dose1=(dose1_col, 'sum'),
        sum_dose2=(dose2_col, 'sum')
    ).reset_index()
    
    # Calculate retention rate
    df1['retention_rate'] = (1 - ((df1['sum_dose1'] - df1['sum_dose2']) / df1['sum_dose1'])) * 100
    pivot_table2 = df1.pivot_table(columns=group_col, values='retention_rate')
    return pivot_table2
  

## Vaccination Coverage with Niger dataset

In [8]:
# Reading data into pandas dataframe
df=pd.read_stata("C:/Users/swalke/Desktop/RISP/NigerDHIS2012.DTA",convert_categoricals=False)

In [9]:
df.shape

(12558, 1129)

In [10]:
df.head()

Unnamed: 0,caseid,midx,v000,v001,v002,v003,v004,v005,v006,v007,...,s1307,idx94,s434a,idx95,s506y,s506yd,s506ym,s506yy,s538k,s538l
0,1 1 2,1,NI6,1,1,2,1,2066240,5,2012,...,0,1,,1,1.0,10.0,4.0,2012.0,,
1,1 1 2,2,NI6,1,1,2,1,2066240,5,2012,...,0,2,,2,2.0,,,,,
2,1 2 1,1,NI6,1,2,1,1,2066240,5,2012,...,0,1,,1,0.0,,,,0.0,0.0
3,1 2 1,2,NI6,1,2,1,1,2066240,5,2012,...,0,2,,2,0.0,,,,,
4,1 5 2,1,NI6,1,5,2,1,2066240,5,2012,...,0,1,,1,1.0,13.0,9.0,2011.0,,


In [11]:
# rename columns in dataframe
df = df.rename(columns={'caseid': 'case_id', 
                        'v025': 'residence', 'hw1':'child_age','v106':'respondent_education',
                        'h2':'bcg','h3':'dpt1','h4':'polio1','h5':'dpt2','b4' :'child_gender',
                        'h6':'polio2','h7':'dpt3','h8':'polio3','h9':'mcv',
                        'h0':'polio0', 'v024': 'region',
                        'v000': 'country','v005':'sample_weight','v190':'wealth_index','v191':'wealth_index_factor',
                        'v007':'interview_year','v006':'interview_month','v012':'respondent_age'})

In [12]:
# Define mapping variables
region_mapping = { 1 : "Agadez",
     2 :"Diffa",
     3 :"Dosso",
     4 :"Maradi",
     5 :"Tahoua",
     6 :"Tillaberi",
     7 :"Zinder",
     8 :"Niamey"}
residence_mapping = {1:'Urban',
                 2:'Rural',
                 }
child_gender_mapping = {1:'Male',
                 2:'Female'}
education_mapping= { 0: 'No education',
     1: 'Primary',
     2: 'Secondary',
     3: 'Higher',
     9: 'Unknown'}
wealth_index_mapping={ 1: "Poorest",
     2: "Poorer",
     3: "Middle",
     4: "Richer",
     5: "Richest"}

In [13]:
# Apply mappings to DataFrame columns
df['child_gender'] = df['child_gender'].replace(child_gender_mapping)
df['region'] = df['region'].replace(region_mapping)
df['residence'] = df['residence'].replace(residence_mapping)
df['respondent_education']= df['respondent_education'].replace(education_mapping)
df['wealth_index']=df['wealth_index'].replace(wealth_index_mapping)

In [14]:
# Define mapping
mapping = {0: False, 1: True, 2: True, 3: True, 8: np.nan, 9:np.nan}

# Replace values
df['bcg'] = df['bcg'].replace(mapping)
df['dpt1'] = df['dpt1'].replace(mapping)
df['dpt2'] = df['dpt2'].replace(mapping)
df['dpt3'] = df['dpt3'].replace(mapping)
df['polio1'] = df['polio1'].replace(mapping)
df['polio2'] = df['polio2'].replace(mapping)
df['polio3'] = df['polio3'].replace(mapping)
df['polio0'] = df['polio0'].replace(mapping)
df['mcv'] = df['mcv'].replace(mapping)

In [15]:
#Take the 'women's individual sample weight' variable and divide by 1 million.
df['sample_weight'] = df['sample_weight']/1000000

In [16]:
# Aplpy sampling weight to each vaccine variable
df['bcg_sampled'] = df['bcg'] * df['sample_weight']
df['dpt1_sampled'] = df['dpt1']* df['sample_weight']
df['dpt2_sampled'] = df['dpt2']* df['sample_weight']
df['dpt3_sampled'] = df['dpt3']* df['sample_weight']
df['polio1_sampled'] = df['polio1']* df['sample_weight']
df['polio2_sampled'] = df['polio2']* df['sample_weight']
df['polio3_sampled'] = df['polio3']* df['sample_weight']
df['polio0_sampled'] = df['polio0']* df['sample_weight']
df['mcv_sampled'] = df['mcv']* df['sample_weight']

In [17]:
columns_to_include = ['case_id','country','region','interview_year','interview_month',
                    'residence','child_age','child_gender',
                    'wealth_index','wealth_index_factor','respondent_age','respondent_education',
                    'bcg','dpt1','dpt2','dpt3','polio1','polio2','polio3','polio0',
                    'mcv','sample_weight']

In [18]:
df_download = df[columns_to_include].copy()
df_download['country']= 'Niger'
df_download['data_source']= 'Niger DHS 2012'
df_download.head(5)

Unnamed: 0,case_id,country,region,interview_year,interview_month,residence,child_age,child_gender,wealth_index,wealth_index_factor,...,dpt1,dpt2,dpt3,polio1,polio2,polio3,polio0,measles,sample_weight,data_source
0,1 1 2,Niger,Zinder,2012,5,Rural,,Female,Richer,-34581,...,True,True,True,True,True,True,False,True,2.06624,Niger DHS 2012
1,1 1 2,Niger,Zinder,2012,5,Rural,,Male,Richer,-34581,...,True,True,True,True,True,True,False,True,2.06624,Niger DHS 2012
2,1 2 1,Niger,Zinder,2012,5,Rural,,Male,Poorer,-57130,...,False,False,False,True,False,False,True,False,2.06624,Niger DHS 2012
3,1 2 1,Niger,Zinder,2012,5,Rural,,Female,Poorer,-57130,...,False,False,False,True,False,False,False,False,2.06624,Niger DHS 2012
4,1 5 2,Niger,Zinder,2012,5,Rural,16.0,Female,Poorer,-57976,...,True,True,True,True,True,True,True,True,2.06624,Niger DHS 2012


In [19]:
file_path = 'C:/Users/swalke/PATH BMGF RISP Dropbox/BMGF RISP Project/Quant analysis/Data landscape/CSV Data Files/NigerDHS2012.csv'
df_download.to_csv(file_path, index=False)

OSError: Cannot save file into a non-existent directory: 'C:\Users\swalke\PATH BMGF RISP Dropbox\BMGF RISP Project\Quant analysis\Data landscape\CSV Data Files'

#####  Calcualte vaccination coverage for each vaccine

In [22]:
calculate_vaccination_coverage(df, 12, 23, 'bcg_sampled')
calculate_vaccination_coverage(df, 12, 23, 'dpt1_sampled')
calculate_vaccination_coverage(df, 12, 23, 'dpt3_sampled')
calculate_vaccination_coverage(df, 12, 23, 'polio0_sampled')
calculate_vaccination_coverage(df, 12, 23, 'polio1_sampled')
calculate_vaccination_coverage(df, 12, 23, 'polio3_sampled')
calculate_vaccination_coverage(df, 12, 23, 'mcv_sampled')

bcg_sampled Vaccination Coverage for children between 12 to 23 months for Niger: 87.33%
dpt1_sampled Vaccination Coverage for children between 12 to 23 months for Niger: 88.64%
dpt3_sampled Vaccination Coverage for children between 12 to 23 months for Niger: 72.32%
polio0_sampled Vaccination Coverage for children between 12 to 23 months for Niger: 62.69%
polio1_sampled Vaccination Coverage for children between 12 to 23 months for Niger: 95.58%
polio3_sampled Vaccination Coverage for children between 12 to 23 months for Niger: 78.46%
measles_sampled Vaccination Coverage for children between 12 to 23 months for Niger: 71.71%


71.71460029325507

In [23]:
calculate_vaccination_coverage(df, 12, 23, 'bcg')
calculate_vaccination_coverage(df, 12, 23, 'dpt1')
calculate_vaccination_coverage(df, 12, 23, 'dpt3')
calculate_vaccination_coverage(df, 12, 23, 'polio0')
calculate_vaccination_coverage(df, 12, 23, 'polio1')
calculate_vaccination_coverage(df, 12, 23, 'polio3')
calculate_vaccination_coverage(df, 12, 23, 'mcv')

bcg Vaccination Coverage for children between 12 to 23 months for Niger: 84.46%
dpt1 Vaccination Coverage for children between 12 to 23 months for Niger: 84.65%
dpt3 Vaccination Coverage for children between 12 to 23 months for Niger: 70.87%
polio0 Vaccination Coverage for children between 12 to 23 months for Niger: 63.44%
polio1 Vaccination Coverage for children between 12 to 23 months for Niger: 90.13%
polio3 Vaccination Coverage for children between 12 to 23 months for Niger: 75.37%
measles Vaccination Coverage for children between 12 to 23 months for Niger: 70.67%


70.67448680351906

##### Calcualte vaccine coverage by child_gender

In [24]:
# List of vaccines
vaccines = ['bcg_sampled', 'dpt1_sampled', 'dpt2_sampled','dpt3_sampled','polio0_sampled','polio1_sampled','polio2_sampled','polio3_sampled','mcv_sampled']

In [25]:
# List of vaccines
vaccines1 = ['bcg', 'dpt1','dpt3','polio0','polio1','polio3','mcv']

In [26]:
filtered_df = df[(df['child_age'] >= 12) & (df['child_age'] <= 23)]

In [27]:
# List of child_genders
child_genders = ['Male','Female']
# Create an empty list to store the results
results = []

# Loop through each vaccine and child_gender
for vaccine in vaccines1:
    for child_gender in child_genders:
        coverage = calculate_vaccination_coverage_child_gender(filtered_df, 12, 23, child_gender, vaccine)
        results.append({'Vaccine': vaccine, 'child_gender': child_gender, 'Coverage': coverage})

# Convert the list of dictionaries into a DataFrame
results_df = pd.DataFrame(results)
results_df['Coverage'] = results_df['Coverage'].round(2)
# Display the results as a table
pivot_table1 = results_df.pivot_table(index='Vaccine', columns='child_gender', values='Coverage')
pivot_table1

child_gender,Female,Male
Vaccine,Unnamed: 1_level_1,Unnamed: 2_level_1
bcg,84.8,84.08
dpt1,85.39,85.07
dpt3,73.24,69.33
measles,70.7,71.22
polio0,63.98,62.86
polio1,90.3,91.6
polio3,77.07,77.45


##### Calcualte vaccine coverage by region

In [28]:
# List of region
regions = ["Agadez", "Diffa", "Dosso", "Maradi", "Tahoua", "Tillaberi", "Zinder", "Niamey"]
# Create an empty list to store the results
results1 = []

# Loop through each vaccine and child_gender
for vaccine in vaccines1:
    for region in regions:
        coverage = calculate_vaccination_coverage_region(filtered_df, 12, 23, region, vaccine)
        results1.append({'Vaccine': vaccine, 'region': region, 'Coverage': coverage})

# Convert the list of dictionaries into a DataFrame
results_df1 = pd.DataFrame(results1)
results_df1['Coverage'] = results_df1['Coverage'].round(2)
# Display the results as a table
pivot_table = results_df1.pivot_table(index='Vaccine', columns='region', values='Coverage')
pivot_table

region,Agadez,Diffa,Dosso,Maradi,Niamey,Tahoua,Tillaberi,Zinder
Vaccine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bcg,92.98,66.67,84.93,85.13,98.04,80.12,88.89,81.12
dpt1,89.29,63.75,91.1,87.11,94.12,77.5,89.63,85.31
dpt3,83.93,43.75,80.82,71.65,87.25,65.0,76.3,62.94
measles,80.7,54.22,78.08,68.91,87.25,63.12,75.56,65.03
polio0,78.95,44.05,67.81,60.0,92.16,51.55,66.67,58.74
polio1,91.23,67.86,95.74,91.19,97.03,90.68,91.85,94.37
polio3,89.47,47.62,87.05,79.06,86.14,73.86,77.04,75.36


##### Calcualte vaccine coverage by residence

In [29]:
df['residence'].value_counts()

residence
Rural    9802
Urban    2756
Name: count, dtype: int64

In [30]:
# List of region
residences = ["Urban", "Rural"]
# Create an empty list to store the results
results2 = []

# Loop through each vaccine and child_gender
for vaccine in vaccines1:
    for residence in residences:
        coverage = calculate_vaccination_coverage_residence(filtered_df, 12, 23,residence,vaccine)
        results2.append({'Vaccine': vaccine, 'residence': residence, 'Coverage': coverage})

# Convert the list of dictionaries into a DataFrame
results_df2 = pd.DataFrame(results2)
results_df2['Coverage'] = results_df2['Coverage'].round(2)
# Display the results as a table
pivot_table2 = results_df2.pivot_table(index='Vaccine', columns='residence', values='Coverage')
pivot_table2

residence,Rural,Urban
Vaccine,Unnamed: 1_level_1,Unnamed: 2_level_1
bcg,80.26,96.27
dpt1,82.09,94.03
dpt3,65.51,87.69
measles,66.71,82.84
polio0,54.44,88.81
polio1,89.3,95.49
polio3,74.04,86.09


##### Calcualte DPT retention rate per region

In [31]:
calculate_retention_rate(filtered_df, 'region', 'dpt1_sampled', 'dpt3_sampled')

region,Agadez,Diffa,Dosso,Maradi,Niamey,Tahoua,Tillaberi,Zinder
retention_rate,91.015353,69.635887,89.479449,80.070068,92.606389,85.464824,81.778348,70.667932


In [32]:
calculate_retention_rate(filtered_df, 'region', 'dpt1', 'dpt3')

region,Agadez,Diffa,Dosso,Maradi,Niamey,Tahoua,Tillaberi,Zinder
retention_rate,94.0,68.627451,88.721805,82.248521,92.708333,83.870968,85.123967,73.770492


##### Calcualte DPT retention rate per child_gender

In [33]:
calculate_retention_rate(filtered_df, 'child_gender', 'dpt1_sampled', 'dpt3_sampled')

child_gender,Female,Male
retention_rate,84.941396,77.852394


##### Calcualte DPT retention rate by residence

In [34]:
calculate_retention_rate(filtered_df, 'residence', 'dpt1_sampled', 'dpt3_sampled')

residence,Rural,Urban
retention_rate,79.031768,93.724838


##### Calcualte OPV retention rate per region

In [35]:
calculate_retention_rate(filtered_df, 'region', 'polio1', 'polio3')

region,Agadez,Diffa,Dosso,Maradi,Niamey,Tahoua,Tillaberi,Zinder
retention_rate,98.076923,70.175439,89.62963,85.795455,88.77551,77.39726,83.870968,77.61194


##### Calcualte DPT retention rate per child_gender

In [36]:
calculate_retention_rate(filtered_df, 'child_gender', 'polio1', 'polio3')

child_gender,Female,Male
retention_rate,84.210526,82.997763


##### Calcualte DPT retention rate by residence

In [37]:
calculate_retention_rate(filtered_df, 'residence','polio1', 'polio3')

residence,Rural,Urban
retention_rate,81.137725,90.15748


##### DPT retention NAtional

In [38]:
(1 - ((filtered_df['dpt1'].sum() - filtered_df['dpt3'].sum()) / filtered_df['dpt1'].sum())) * 100

83.71824480369516

##### Polio retention NAtional

In [39]:
(1 - ((filtered_df['polio1'].sum() - filtered_df['polio3'].sum()) / filtered_df['polio1'].sum())) * 100

83.62255965292842

##### Zero dose Childern

In [43]:
# Calculate the number of no vaccinated kids
no_vaccinated = filtered_df[(filtered_df['bcg']==0) & (filtered_df['polio1']==0) & (filtered_df['measles']==0) & (filtered_df['dpt1']==0)]
num_no_vaccinated = len(no_vaccinated)

# Calculate the total number of kids
total_kids =len(filtered_df)


# Calculate the percentage of no vaccinated kids
percentage_no_vaccinated = (num_no_vaccinated / total_kids) * 100
percentage_no_vaccinated

6.647116324535679

# Fully Vaccinated

In [44]:
# Calculate the number of fully vaccinated kids
fully_vaccinated = filtered_df[(filtered_df['bcg']==1) & (filtered_df['polio3']==1) & (filtered_df['measles']==1) & (filtered_df['dpt3']==1)]
num_fully_vaccinated = len(fully_vaccinated)

# Calculate the total number of kids
total_kids = len(filtered_df)

# Calculate the percentage of fully vaccinated kids
percentage_fully_vaccinated = (num_fully_vaccinated / total_kids) * 100
percentage_fully_vaccinated

55.32746823069403

## No of responders per region

In [None]:
grouped_data = df.groupby(['region'])['case_id'].count().sort_values()

# Plotting the bar chart with the specified size
plt.figure(figsize=(20, 6))  # Change the size of the plot

bars = plt.bar(grouped_data.index, grouped_data.values, color='skyblue')  # Set color as skyblue

# Adding totals above each bar
for bar, total in zip(bars, grouped_data.values):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.1, total, ha='center', va='bottom')

plt.title('Number of Cases per region')  # Change the title
plt.ylabel('Number of Cases')  # Change the y-axis label
plt.xlabel('region')  # Optional: Change the x-axis label

plt.show()

In [None]:
df_grouped = df.groupby(['region', 'residence'])['case_id'].count().reset_index().sort_values(by='case_id')

fig = px.bar(df_grouped, 
             x='region', y='case_id', color='residence',
             labels={'x': 'region', 'case_id': 'Number of Cases'}, 
             title='Cases per region by residence',
             category_orders={'region': df_grouped['region'].tolist()},
            )
fig.show()