In [None]:
!pip install wbdata

In [15]:
import wbdata as wb
import pandas as pd
import numpy as np

In [2]:
def population_interpolated(year, sex, age_range, place, summary=False):
    '''
    Returns an integer specifying the total population of a given age range, male or female, 
    in a given year and place.  When the age bins specified are not the age bins 
    given by wbdata, populations are interpolated to create estimates for the age bins. 
    Although there is data for ages above 99 in wbdata, it is excluded to reduce the complexity of the function. 
    
            Parameters:
                    year (int): Year to get population totals from. 
                    sex (str): 'male' or 'female'  
                    age_range (tuple of ints): Age range slice. Can be the same year e.g. (2,2)
                                to get a single age. Must be between 0 and 99, inclusive. 
                        
                    place(str): Country code from wbdata, e.g. 'THA' or 'IND'
                    summary 

            Returns:
                    Integer of total population for a sex, age_range, and year   
    '''
    assert sex in ['male', 'female'], "sex must be 'male' or 'female'"
    assert 1960 < year < 2021, "Year must be between 1960 and 2021 (inclusive)"
    assert type(age_range) == tuple
    assert age_range[0] in range(0,100), "Age range must be between 0 and 99 (inclusive)"
    assert age_range[1] in range(0,100), "Age range must be between 0 and 99 (inclusive)"
    assert age_range[1] >= age_range[0], "The second age value must be higher or equal to the first"
    assert summary in [True, False], "summary must be True or False"
    
    # Convert sex variable to use in regex
    if sex == 'female':
        sex_re = 'FE'
    else:
        sex_re = 'MA'
    
    # Select only relevant indicators 
    indicator_40_df = pd.DataFrame(wb.get_indicator(source='40', cache=False))
    gender_bins = indicator_40_df[indicator_40_df['id'].str.fullmatch('SP.POP.*.' + sex_re)]
    indicator_dict = dict(zip(gender_bins.id, gender_bins.name))
    
    # Select a range of specified age bins 
    start = int(np.floor(age_range[0]/5))
    stop = int(np.floor(age_range[1]/5))
    gender_slice = dict(list(indicator_dict.items())[start:stop+1])

    # Generate DataFrame using only specified age range 
    year = pd.to_datetime(year, format='%Y')
    df = wb.get_dataframe(
        indicators=gender_slice, 
        country=place, 
        data_date = year, 
        cache=False,
        keep_levels=True)
    
    # Scale populatin bins according to the given age range
    # If an age range is given that doesn't match the predetermined bins, it is scaled to create an estimate.
    # For example, a valid bin is 0-4. If we want to get population estimates for only ages 2-4, then we simply 
    # take 3/4, or 75%, of the data from that bin. 
    
    # Edge case: if selecting range from a single bin, just calculate a fraction of the bin
    if len(df.columns) == 1: 
        end_scale = 1 
        if age_range[0] and age_range[1] > 79:
            start_scale = (age_range[1] - age_range[0] + 1) / 20
        else:
            start_scale = (age_range[1] - age_range[0] + 1) / 5
    
    # Edge case: if end range is in 80UP but start range is not 
    elif age_range[1] > 79: 
        start_scale = (5 - (age_range[0] % 5)) / 5
        end_scale = ((age_range[1] % 20) + 1) / 2
        
    # All other possible slices 
    else:
        start_scale = (5 - (age_range[0] % 5)) / 5
        end_scale = ((age_range[1] % 5) + 1) / 5 
    
    # Scale first and last bin
    df.iloc[:,0] = df.iloc[:,0] * start_scale 
    df.iloc[:, -1] = df.iloc[:, -1] * end_scale
    
    pop_sum = df.sum(axis=1)[0]
    
    if summary == True:
        return (f'In {year.year}, approximately {pop_sum} {sex}s'
               f' aged {age_range[0]} to {age_range[1]} were living in {place}'
        )
    else:
        return pop_sum

In [13]:
# Example: Female population ages 0-2, Thailand, 1990
population_interpolated(1990, 'female', (0,2), 'THA')

1573728.0

In [7]:
def pop_df(start_year=1960, end_year=2021, country='all'):
    '''
    Returns pandas DataFrame of male and female population counts 
    in age bins from wbdata, indexed by country and year. 
    
            Parameters:
                    start_year (int): First year of the year range (inclusive)
                    end_year (int): End year of the year range (inclusive)
                    country (list): Country or list of countries to return. 
                                    Returns all countries if not specified.

            Returns:
                    df (pandas DataFrame): pandas DataFrame 
    '''
    # Select only relevant indicators
    indicator_40_df = pd.DataFrame(wb.get_indicator(source='40', cache=False))
    gender_bins = indicator_40_df[indicator_40_df['id'].str.fullmatch('SP.POP.*.(FE|MA)')]
    indicator_dict = dict(zip(gender_bins.id, gender_bins.name))
    
    # Construct dataframe with wbdata.get_dataframe() 
    start_year = pd.to_datetime(start_year, format='%Y')
    end_year = pd.to_datetime(end_year, format='%Y')
    df = wb.get_dataframe(
        indicators=indicator_dict, 
        country=country, 
        data_date = (start_year, end_year), 
        cache=False,
        keep_levels=True) 
    df = df.rename_axis(index=['country', 'year'])
    return df 

In [9]:
# Populations from Thailand, Vietnam, and India from 1960-2001
tha_vnm_ind = pop_df(country = ['THA'], start_year=1960, end_year=2001)
tha_vnm_ind.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,"Population ages 00-04, female","Population ages 00-04, male","Population ages 05-09, female","Population ages 05-09, male","Population ages 10-14, female","Population ages 10-14, male","Population ages 15-19, female","Population ages 15-19, male","Population ages 20-24, female","Population ages 20-24, male",...,"Population ages 60-64, female","Population ages 60-64, male","Population ages 65-69, female","Population ages 65-69, male","Population ages 70-74, female","Population ages 70-74, male","Population ages 75-79, female","Population ages 75-79, male","Population ages 80 and above, female","Population ages 80 and above, male"
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Thailand,2001,2226953.0,2355033.0,2460746.0,2575952.0,2575554.0,2661364.0,2662085.0,2750301.0,2633613.0,2696171.0,...,1120083.0,987269.0,902778.0,776102.0,662643.0,567647.0,408717.0,336294.0,387543.0,261794.0
Thailand,2000,2281245.0,2412779.0,2493524.0,2608519.0,2604471.0,2695994.0,2675659.0,2775085.0,2625516.0,2686382.0,...,1107322.0,975829.0,868901.0,752578.0,634069.0,548046.0,383838.0,314540.0,366617.0,243241.0
Thailand,1999,2332574.0,2464916.0,2520335.0,2630954.0,2626982.0,2720857.0,2699849.0,2797557.0,2697522.0,2745341.0,...,1084377.0,955777.0,834456.0,728516.0,599693.0,521842.0,358744.0,292768.0,354588.0,234778.0
Thailand,1998,2385913.0,2518709.0,2538032.0,2644882.0,2653337.0,2751604.0,2733467.0,2828069.0,2768897.0,2809448.0,...,1050309.0,928094.0,802753.0,707757.0,563103.0,492066.0,337990.0,274160.0,342166.0,225698.0
Thailand,1997,2435971.0,2567846.0,2554397.0,2658681.0,2684622.0,2786640.0,2784635.0,2873911.0,2830817.0,2872522.0,...,1009589.0,896285.0,773300.0,688942.0,526256.0,460332.0,321376.0,258674.0,329081.0,215644.0
