# THAILAND 1960-2000

Group Ma Yinchu 

In [1]:
!pip install wbdata
!pip install cufflinks
!pip install gspread-pandas
!pip install plotly 

import plotly as plotly
import wbdata as wb
import pandas as pd
import numpy as np
import cufflinks as cf
import gspread_pandas as gsp




The Shapely GEOS version (3.10.2-CAPI-1.16.0) is incompatible with the GEOS version PyGEOS was compiled with (3.10.1-CAPI-1.16.0). Conversions between both will be slow.



**[#A] Population Statistics Function** 
A Python function named population
(implemented below as `population_interpolated`)
that delivers the information needed to
answer queries of the following form:

In [year] how many
[people/males/females] aged [low] to
[high] were living in [the
world/region/country]?

In [2]:
def population_interpolated(year, sex, age_range, place, summary=False):
    '''
    Returns an estimate of the total population of one sex within an age range,
    in a given year and place.

    wbdata reports population in five-year age bins (00-04, 05-09, ..., 75-79) plus an
    open-ended "80 and above" bin.  When the requested age range does not line up with
    those bins, the first and last bins are scaled by the share of their ages that fall
    inside the range, i.e. people are assumed to be spread evenly over the ages of a bin.
    Although there is data for ages above 99 in wbdata, it is not treated separately:
    the "80 and above" bin is handled as if it covered ages 80-99 (20 years).

            Parameters:
                    year (int): Year to get population totals from, 1960 to 2021 (inclusive).
                    sex (str): 'male' or 'female'
                    age_range (tuple of ints): Age range slice (low, high), both inclusive.
                                Can be the same year e.g. (2,2) to get a single age.
                                Must be between 0 and 99, inclusive.
                    place (str): Country code from wbdata, e.g. 'THA' or 'IND'
                    summary (bool): If True, return a sentence describing the result
                                instead of the bare number.

            Returns:
                    Estimated population (float) for the sex, age_range, and year,
                    or a descriptive string when summary is True.

            Raises:
                    AssertionError: if any argument fails the checks described above.
    '''
    assert sex in ['male', 'female'], "sex must be 'male' or 'female'"
    # Both bounds are inclusive, as the message states.
    assert 1960 <= year <= 2021, "Year must be between 1960 and 2021 (inclusive)"
    assert isinstance(age_range, tuple) and len(age_range) == 2, "age_range must be a tuple of two ages"
    assert age_range[0] in range(0,100), "Age range must be between 0 and 99 (inclusive)"
    assert age_range[1] in range(0,100), "Age range must be between 0 and 99 (inclusive)"
    assert age_range[1] >= age_range[0], "The second age value must be higher or equal to the first"
    assert summary in [True, False], "summary must be True or False"

    bin_width = 5          # width of the regular age bins
    last_bin = 16          # position of the "80 and above" bin (bins 0-15 are 00-04 ... 75-79)
    last_bin_width = 20    # "80 and above" is treated as ages 80-99

    low, high = age_range

    # Convert sex variable to use in regex
    sex_re = 'FE' if sex == 'female' else 'MA'

    # Select only relevant indicators (ids such as SP.POP.0004.FE); dots are escaped so
    # they match literally.  Sorting by id puts the bins in ascending age order
    # ('0004' < '0509' < ... < '7579' < '80UP'), which the positional slice below relies on.
    indicator_40_df = pd.DataFrame(wb.get_indicator(source='40', cache=False))
    gender_bins = indicator_40_df[indicator_40_df['id'].str.fullmatch(r'SP\.POP\..*\.' + sex_re)]
    gender_bins = gender_bins.sort_values('id')
    indicator_dict = dict(zip(gender_bins['id'], gender_bins['name']))

    # Select a range of specified age bins.  Every age from 80 upwards lives in the
    # last bin, so positions are clamped to it (85 // 5 would otherwise point past the end).
    start = min(int(low // bin_width), last_bin)
    stop = min(int(high // bin_width), last_bin)
    gender_slice = dict(list(indicator_dict.items())[start:stop+1])

    # Generate DataFrame using only specified age range
    data_date = pd.to_datetime(year, format='%Y')
    df = wb.get_dataframe(
        indicators=gender_slice,
        country=place,
        data_date=data_date,
        cache=False,
        keep_levels=True)

    # Scale population bins according to the given age range.
    # If an age range is given that doesn't match the predetermined bins, it is scaled to create an estimate.
    # For example, a valid bin is 0-4. If we want to get population estimates for only ages 2-4, then we simply
    # take 3/5, or 60%, of the data from that bin.
    start_width = last_bin_width if start == last_bin else bin_width
    stop_width = last_bin_width if stop == last_bin else bin_width

    if start == stop:
        # Whole range sits inside one bin: take the covered fraction of that bin once.
        start_scale = (high - low + 1) / start_width
        end_scale = 1
    else:
        # Ages from `low` to the top of the first bin, and from the bottom of the
        # last bin up to `high`, as fractions of their bins.
        start_scale = (start * bin_width + start_width - low) / start_width
        end_scale = (high - stop * bin_width + 1) / stop_width

    # Weight the first and last bins; interior bins count in full.
    weights = np.ones(len(df.columns))
    weights[0] *= start_scale
    weights[-1] *= end_scale

    # One row is expected (one place, one year); take it by position.
    pop_sum = (df * weights).sum(axis=1).iloc[0]

    if summary == True:
        return (f'In {data_date.year}, approximately {pop_sum} {sex}s'
               f' aged {age_range[0]} to {age_range[1]} were living in {place}'
       )
    else:
        return pop_sum

In [3]:
 # Females aged 0 to 2 in Thailand in 1990 (a partial slice of the 00-04 bin).
 population_interpolated(1990, 'female', (0,2), 'THA')

1573728.0

In [4]:
 # Males aged 12 to 33 in Thailand in 2000 (range starts and ends mid-bin).
 population_interpolated(2000, 'male', (12,33), 'THA')

11956903.8

In [5]:
# Example error: 'mal' is not an accepted value for sex, so the input assertion fails.
population_interpolated(1990, 'mal', (0,2), 'THA')

AssertionError: sex must be 'male' or 'female'

**[#A] Population DataFrames** 

A function that returns a pandas DataFrame
indexed by Region or Country and Year, with
columns giving counts of people in different
age-sex groups.

In [6]:
def pop_df(start_year=1960, end_year=2021, country='all'):
    '''
    Builds a pandas DataFrame of wbdata population counts, one column per
    age-bin/sex indicator, indexed by country and year.

            Parameters:
                    start_year (int): First year of the year range (inclusive)
                    end_year (int): End year of the year range (inclusive)
                    country (str or list): Country code or list of country codes,
                                    passed through to wbdata. Defaults to 'all'.

            Returns:
                    pandas DataFrame whose index levels are named
                    'country' and 'year'.
    '''
    # Look up the source-40 indicator catalogue and keep the ids matching the
    # age-bin pattern for either sex.
    catalogue = pd.DataFrame(wb.get_indicator(source='40', cache=False))
    is_age_sex_bin = catalogue['id'].str.fullmatch('SP.POP.*.(FE|MA)')
    age_sex_bins = catalogue[is_age_sex_bin]

    # wbdata expects {indicator id: column label}
    indicators = {
        code: label
        for code, label in zip(age_sex_bins['id'], age_sex_bins['name'])
    }

    # The date range is handed to wbdata as a (start, end) pair of timestamps
    date_range = (
        pd.to_datetime(start_year, format='%Y'),
        pd.to_datetime(end_year, format='%Y'),
    )

    counts = wb.get_dataframe(
        indicators=indicators,
        country=country,
        data_date=date_range,
        cache=False,
        keep_levels=True)

    return counts.rename_axis(index=['country', 'year'])

In [7]:
# Test: population counts for Thailand from 1960-2001.
# NOTE: despite the variable name, only 'THA' is requested here; Vietnam and
# India are not included in the country list.
tha_vnm_ind = pop_df(country = ['THA'], start_year=1960, end_year=2001)
tha_vnm_ind

Unnamed: 0_level_0,Unnamed: 1_level_0,"Population ages 00-04, female","Population ages 00-04, male","Population ages 05-09, female","Population ages 05-09, male","Population ages 10-14, female","Population ages 10-14, male","Population ages 15-19, female","Population ages 15-19, male","Population ages 20-24, female","Population ages 20-24, male",...,"Population ages 60-64, female","Population ages 60-64, male","Population ages 65-69, female","Population ages 65-69, male","Population ages 70-74, female","Population ages 70-74, male","Population ages 75-79, female","Population ages 75-79, male","Population ages 80 and above, female","Population ages 80 and above, male"
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Thailand,2001,2226953.0,2355033.0,2460746.0,2575952.0,2575554.0,2661364.0,2662085.0,2750301.0,2633613.0,2696171.0,...,1120083.0,987269.0,902778.0,776102.0,662643.0,567647.0,408717.0,336294.0,387543.0,261794.0
Thailand,2000,2281245.0,2412779.0,2493524.0,2608519.0,2604471.0,2695994.0,2675659.0,2775085.0,2625516.0,2686382.0,...,1107322.0,975829.0,868901.0,752578.0,634069.0,548046.0,383838.0,314540.0,366617.0,243241.0
Thailand,1999,2332574.0,2464916.0,2520335.0,2630954.0,2626982.0,2720857.0,2699849.0,2797557.0,2697522.0,2745341.0,...,1084377.0,955777.0,834456.0,728516.0,599693.0,521842.0,358744.0,292768.0,354588.0,234778.0
Thailand,1998,2385913.0,2518709.0,2538032.0,2644882.0,2653337.0,2751604.0,2733467.0,2828069.0,2768897.0,2809448.0,...,1050309.0,928094.0,802753.0,707757.0,563103.0,492066.0,337990.0,274160.0,342166.0,225698.0
Thailand,1997,2435971.0,2567846.0,2554397.0,2658681.0,2684622.0,2786640.0,2784635.0,2873911.0,2830817.0,2872522.0,...,1009589.0,896285.0,773300.0,688942.0,526256.0,460332.0,321376.0,258674.0,329081.0,215644.0
Thailand,1996,2476362.0,2603928.0,2577117.0,2679435.0,2725370.0,2827549.0,2855895.0,2938171.0,2878902.0,2930383.0,...,970179.0,866338.0,744321.0,669245.0,492568.0,429954.0,307870.0,245524.0,315364.0,204419.0
Thailand,1995,2503412.0,2621863.0,2611261.0,2710740.0,2782849.0,2880099.0,2940394.0,3016471.0,2910788.0,2978949.0,...,937133.0,841811.0,714516.0,646610.0,464425.0,403390.0,296470.0,233864.0,300973.0,191957.0
Thailand,1994,2530546.0,2646857.0,2640703.0,2743789.0,2819306.0,2917298.0,2957567.0,3036604.0,2912101.0,2984312.0,...,900855.0,816291.0,676401.0,616677.0,436085.0,377967.0,282953.0,222386.0,296270.0,189217.0
Thailand,1993,2556550.0,2667130.0,2675532.0,2780620.0,2862663.0,2957089.0,2977636.0,3058857.0,2917850.0,2990484.0,...,869718.0,795431.0,637465.0,583500.0,413324.0,356389.0,272719.0,213937.0,290571.0,184393.0
Thailand,1992,2581035.0,2685278.0,2714601.0,2819961.0,2907590.0,2996960.0,2999608.0,3083443.0,2921184.0,2991746.0,...,841061.0,776850.0,598697.0,548558.0,394786.0,337932.0,265354.0,208152.0,282992.0,177175.0


**[#B] Population Pyramids**

A Python function that takes as
input a pandas DataFrame with
columns providing counts of people by
age-sex groups, and constructs a
“population pyramid” graph for
visualizing the data.

**[#C] Animated Population Pyramids**

A Python function that takes as
input a pandas DataFrame with
columns providing counts of people by
age-sex groups, with rows
corresponding to different years, and
constructs an animated “population
pyramid” graph for visualizing how the
population changes over time.

In [8]:
# Second name for the same Thailand DataFrame, used to try out the row/column selections below.
test_pop = tha_vnm_ind

In [9]:
# Thailand's 1985 row, keeping only columns whose name contains " male"
# (the leading space keeps "female" columns out), as a flat array.
test_pop.loc['Thailand', str(1985),:].filter(regex=" male").values[0]

array([2925125., 3089196., 3201196., 3072549., 2598847., 2234400.,
       1867711., 1486698., 1214468., 1120708.,  925265.,  750135.,
        530116.,  369951.,  262114.,  163805.,  125518.])

In [10]:
# Same selection for the columns whose name contains " female".
test_pop.loc['Thailand', str(1985),:].filter(regex=" female").values[0]

array([2810956., 2990843., 3097928., 2994646., 2563821., 2278807.,
       1950794., 1541963., 1235512., 1170056.,  974433.,  772736.,
        555613.,  414153.,  312950.,  223068.,  200820.])

In [11]:
def PopulationPyramid(pop_data, country, year):
    '''
    Adapted from Population Lecture 1

    Description: Draws a static "population pyramid" for one country and year.
        Men are drawn as bars with positive lengths and women with negated
        lengths, so the two sexes extend in opposite directions from zero.

    Parameters:
        pop_data: dataframe indexed by country and year, with columns providing
            counts of people by age-sex groups (column names containing
            " male" / " female")
        country: country of interest (string), as it appears in the index
        year: year of interest (integer)

    Returns nothing; the figure is rendered in the notebook.
    '''

    # Function-local imports, as in the lecture code this is adapted from
    import numpy as np
    import pandas as pd
    import plotly.graph_objs as go
    import plotly.offline as py

    # Make plotly render inline in the notebook
    py.init_notebook_mode(connected=True)

    # Vertical position of each age-bin bar: 1, 6, 11, ..., 81
    bar_positions = np.arange(1,82,5)

    # Pull the requested country/year once and split it by sex.
    # May have to change based on the names of columns in pop_data
    selected = pop_data.loc[country, str(year),:]
    male_counts = selected.filter(regex=" male").values[0]
    female_counts = selected.filter(regex=" female").values[0]

    men = go.Bar(
        x=male_counts,
        y=bar_positions,
        orientation='h',
        name='Men',
        marker=dict(color='purple'),
        hoverinfo='skip',
    )
    women = go.Bar(
        x=-female_counts,  # negated so these bars point the other way
        y=bar_positions,
        orientation='h',
        name='Women',
        marker=dict(color='pink'),
        hoverinfo='skip',
    )

    age_axis = go.layout.YAxis(range=[0, 90], title='Age')
    count_axis = go.layout.XAxis(title='Number')
    layout = go.Layout(barmode='overlay', yaxis=age_axis, xaxis=count_axis)

    py.iplot(dict(data=[men, women], layout=layout))

In [12]:
# Test: population pyramid for Thailand in 1990.
# The frame is loaded for 1960-2001, which includes the plotted year.
my_pop = pop_df(country = ['THA'], start_year=1960, end_year=2001)
PopulationPyramid(my_pop, 'Thailand', 1990)

In [13]:
def AnimatedPyramid(pop_data, country, start_year, end_year):
    '''
    Adapted from Population Lecture 1

    Description: Constructs an "animated population pyramid" graph to visualize population data
        over time, within a specified range of years (inclusive). A slider selects
        the year and the bars are redrawn for that year.

    Parameters:
        pop_data: dataframe indexed by country and year, with columns providing
            counts of people by age-sex groups (column names containing
            " male" / " female")
        country: country of interest (string), as it appears in the index
        start_year: starting year (integer)
        end_year: ending year (integer)

    Returns:
        An ipywidgets VBox holding the figure widget above the year slider.
    '''

    # Function-local imports, as in the lecture code this is adapted from
    import plotly.offline as py
    import plotly.graph_objs as go
    import numpy as np
    from ipywidgets import interactive, VBox

    # Initialize notebook
    py.init_notebook_mode(connected=True)

    # Vertical position of each age-bin bar: 1, 6, 11, ..., 81
    age_ranges = np.arange(1,82,5)

    def bar_lengths(year):
        # Bar lengths for one year: men positive, women negated so the two
        # sexes extend in opposite directions from zero.
        selected = pop_data.loc[country, str(year),:]
        men = selected.filter(regex=" male").values[0]
        women = -selected.filter(regex=" female").values[0]
        return men, women

    # Build the figure from the first year in the range
    men_x, women_x = bar_lengths(start_year)
    bins = [
        go.Bar(x=men_x,
               y=age_ranges,
               orientation='h',
               name='Men',
               marker=dict(color='purple'),
               hoverinfo='skip'),
        go.Bar(x=women_x,
               y=age_ranges,
               orientation='h',
               name='Women',
               marker=dict(color='pink'),
               hoverinfo='skip'),
    ]

    layout = go.Layout(barmode='overlay',
        yaxis=go.layout.YAxis(range=[0, 90], title='Age'),
        xaxis=go.layout.XAxis(title='Number'))

    f = go.FigureWidget(data=bins,layout=layout)

    def update_count(year):
        # Slider callback; the parameter must be called `year` because the
        # slider is registered under that keyword below. Only the bar lengths
        # change between years; the y positions stay as created.
        men, women = bar_lengths(year)
        f.data[0].x = men
        f.data[1].x = women

    year_slider = interactive(update_count,year=(start_year,end_year,1))
    vb = VBox((f, year_slider))
    vb.layout.align_items = 'center'
    return vb

In [14]:
# Animated pyramid for Thailand with a year slider covering 1960-2000.
# The frame is loaded for 1960-2001, which includes every slider year.

my_pop = pop_df(country = ['THA'], start_year=1960, end_year=2001)
AnimatedPyramid(my_pop, 'Thailand', 1960, 2000)

VBox(children=(FigureWidget({
    'data': [{'hoverinfo': 'skip',
              'marker': {'color': 'purple'},
…

**[#C] Agricultural Supply** 

Relate changes in agricultural food supply
to changes in population.

In [20]:
!pip install eep153_tools
!pip install python_gnupg

#from eep153_tools.sheets import decrypt_credentials
#decrypt_credentials('../students.json.gpg')
# Only have to run this code once 



In [21]:
!ls ~/.eep153.service_accounts/

students@eep153.iam.gserviceaccount.com


In [22]:
from eep153_tools.sheets import read_sheets

# Read the worksheets of the Google Sheets workbook into a dictionary of dataframes.
# NOTE(review): force_numeric=False presumably leaves cells unconverted; confirm against eep153_tools.
data = read_sheets('https://docs.google.com/spreadsheets/d/1DLn9owcS7ggojJGWlI9vKSz0hqozn6cbcqNGWgzMZ8k',force_numeric=False)

# List the dictionary keys that were loaded
data.keys()

# Optional check of one sheet: data['Output'].head()

Key available for students@eep153.iam.gserviceaccount.com.


dict_keys(['Explanation', 'Ag TFP', 'Output', 'Inputs', 'Factor Shares', 'Ag Land', 'Land Weights', 'Cropland', 'Irrig', 'Pasture', 'Labor', 'Livestock', 'Machinery', 'Fertilizer', 'Feed'])

In [23]:
import pandas as pd

def get_international_ag_productivity_data(series):
    '''
    Reshapes one raw worksheet into a long Series of floats indexed by
    country code and year.

    The worksheet is expected to carry its column names in its second row
    (position 1), including a 'WDI Code' column and one column per year
    from '1961' to '2019'. Data starts on the third row.

            Parameters:
                    series (pandas DataFrame): Raw worksheet with cell values
                                as strings (numbers may contain thousands commas;
                                empty cells are empty strings).

            Returns:
                    pandas Series of floats with a ('WDI Code', 'Year') MultiIndex.
                    Year labels are strings. Empty cells are omitted. When a
                    (code, year) pair occurs more than once, only the first
                    occurrence is kept.

            Raises:
                    KeyError: if 'WDI Code' or one of the year columns is missing.
                    ValueError: if a cell cannot be converted to float.
    '''
    header = series.iloc[1,:]  # Second row holds the real column names
    table = series.iloc[2:,:].copy()  # copy so the caller's frame is left untouched
    table.columns = header

    # Set index to country code
    table = table.set_index('WDI Code')

    # Keep only the yearly columns
    table = table[[str(year) for year in range(1961,2020)]]

    table = table.replace({',':''},regex=True) # Get rid of commas in number strings
    table = table.replace({'':'NaN'}) # Change empty cells to NaN strings
    # astype(float) parses both the numeric strings and 'NaN', so no separate
    # pd.to_numeric pass is needed (its errors='ignore' mode is deprecated).
    table = table.astype(float)

    # Long format: one value per (code, year). Missing values are dropped
    # explicitly so the result does not depend on how stack() treats NaN.
    stacked = table.stack().dropna()

    # Rows sharing a code would produce repeated (code, year) pairs; keep the first
    stacked = stacked.loc[~stacked.index.duplicated(keep='first')]

    stacked.index.names = ['WDI Code','Year']

    return stacked
   

# Worksheets to combine; each one becomes a column of the final frame
Data = [
    'Output', 'Ag TFP', 'Ag Land', 'Irrig', 'Pasture',
    'Labor', 'Livestock', 'Machinery', 'Fertilizer', 'Feed',
]

# Reshape every worksheet into a (WDI Code, Year) series, keyed by sheet name
D = {sheet: get_international_ag_productivity_data(data[sheet]) for sheet in Data}

# Align the series on their shared index, one column per worksheet
df = pd.DataFrame(D)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Output,Ag TFP,Ag Land,Irrig,Pasture,Labor,Livestock,Machinery,Fertilizer,Feed
WDI Code,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
,1961,441067.0,111.0,184.0,4.0,56.0,31.0,263.0,1692.0,2358.0,134016.0
,1962,449727.0,113.0,184.0,4.0,56.0,30.0,252.0,1740.0,2496.0,132329.0
,1963,455937.0,111.0,189.0,4.0,54.0,30.0,259.0,1890.0,2814.0,157105.0
,1964,441651.0,108.0,187.0,4.0,59.0,30.0,253.0,1976.0,3327.0,159653.0
,1965,484620.0,121.0,188.0,5.0,61.0,29.0,246.0,1679.0,3537.0,140357.0
...,...,...,...,...,...,...,...,...,...,...,...
ZWE,2015,1419365.0,97.0,2460.0,175.0,12100.0,4877.0,5278.0,31350.0,91600.0,1318934.0
ZWE,2016,1515900.0,91.0,2966.0,175.0,12100.0,5023.0,5931.0,32911.0,130000.0,1501953.0
ZWE,2017,,,,,,5169.0,,,,
ZWE,2018,,,,,,5330.0,,,,


In [26]:
# Graph agricultural output for all countries.
# Switch cufflinks to offline mode before calling .iplot()
cf.go_offline()

# unstack().T puts years on the rows and one column per WDI code,
# so each country code is drawn as its own line.
df['Output'].unstack().T.iplot(title="Value of Agricultural Output",
                               yTitle='Thousands of 2005-06 Dollars',
                               xTitle='Year')

# Double-click THA in the legend to show Thailand's data

In [27]:
# Thailand's rows only; zeros are treated as missing and any year with a
# missing value in any column is dropped before taking logs.
thailand = df.xs('THA',level='WDI Code').replace(0,np.nan).dropna(how='any')

# Put in log differences (year-to-year change in the log of each series)
dthailand = np.log(thailand).diff()
# Input growth is computed as output growth minus TFP growth
dthailand['Inputs'] = dthailand['Output'] - dthailand['Ag TFP']

# Average yearly log difference of each series
dthailand.mean()


Output        0.029300
Ag TFP        0.015159
Ag Land       0.015397
Irrig         0.025011
Pasture       0.020435
Labor         0.003218
Livestock     0.000508
Machinery     0.075138
Fertilizer    0.085982
Feed          0.049078
Inputs        0.014142
dtype: float64

In [28]:
# Plot Thailand's yearly log differences of output, inputs, and TFP against year
dthailand[['Output','Inputs','Ag TFP']].iplot(title="Growth rates of output, inputs, & TFP",
                                           xTitle="Year")