# Project 1
The following code provides the workflow, functions, analysis and insights into Project 1 for group Ma Yinchu

### Data Setup

In [9]:
import pandas as pd
import numpy as np

In [10]:
!pip install wbdata



In [15]:
import wbdata

In [16]:
#wbdata.get_topics()
#wbdata.get_sources()

In [25]:
indicators = wbdata.get_indicators(source=40)
#indicators

### Deliverable 1 Population Statistics

In [26]:
sexdict = {'Male':'MA', 'Female':'FE'}
def population(year,sex,age_range,place):
    lower, upper = age_range[0], age_range[1]
    assert upper >= lower, "Invalid date range!" # Part of Deliverable 2
    assert sex in ["Male", "Female", "Both"], "Invalid sex Please use one of the following: Male, Female, Both"
    if sex == "Both":
        return population(year,"Male",age_range,place) + population(year,"Female",age_range,place)
    if upper > 80:
        upper = 'UP'
    if upper == lower + 4:
        label = f'SP.POP.{lower:02d}{upper}.{sexdict[sex]}.IN'
        df = wbdata.get_dataframe(label,country=place,parse_dates=True)
    else:
        yrs = [(x, x+4) for x in range(lower, upper, 5)]
        ranges = [f'{x[0]:02d}{x[1]:02d}' for x in yrs]
        var = [f"SP.POP.{x}.{sexdict[sex]}" for x in ranges]
        feed = {x: f"{sex} {ranges[var.index(x)]}" for x in var} # constructs variable dictionary to be fed to wbdata
        
        df = wbdata.get_dataframe(feed,country=place,parse_dates=True).reset_index()
        df = df[df['date'] == f'{year}-01-01'] # selecting relevant year
        df.drop('date', axis=1, inplace=True)
        extra = ((yrs[-1][1] - upper) / 5) * (df.iloc[:,-1].to_list()[0]) # controls for age ranges not on bounds, could change to 
                                                                          # year by year subtraction)
    sum = df.sum(axis=1).iloc[0]
    return (sum - extra)

In [27]:
population(1990, 'Male', (10,33), 'CHN')

263801536.6

### Deliverable 3 Population Data Frames

In [28]:
def population_data_frames(year, sex, age_range, place):
    lower, upper = age_range[0], age_range[1]
    assert upper >= lower, "Invalid date range!" #Part of Deliverable 2
    assert sex in ["Male", "Female", "Both"], "Invalid sex Please use one of the following: Male, Female, Both"
    if sex == "Both":
        return population_data_frames(year,"Male",age_range,place).merge(population_data_frames(year,"Female",age_range,place), on=["date", "country"])
    if upper > 80:
        upper = 'UP'
    if upper == lower + 4:
        label = f'SP.POP.{lower:02d}{upper}.{sexdict[sex]}.IN'
        df = wbdata.get_dataframe(label,country=place,parse_dates=True)
    else:
        yrs = [(x, x+4) for x in range(lower, upper, 5)]
        ranges = [f'{x[0]:02d}{x[1]:02d}' for x in yrs]
        var = [f"SP.POP.{x}.{sexdict[sex]}" for x in ranges]
        feed = {x: f"{sex} {ranges[var.index(x)]}" for x in var} # constructs variable dictionary to be fed to wbdata
        
        df = wbdata.get_dataframe(feed,country=place,parse_dates=True).reset_index()
        df['country'] = place
        df.set_index(['country', 'date'], inplace=True)
    return df

In [29]:
population_data_frames(1990, 'Male', (10,33), 'CHN')

Unnamed: 0_level_0,Unnamed: 1_level_0,Male 1014,Male 1519,Male 2024,Male 2529,Male 3034
country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CHN,2023-01-01,47697475.0,43619845.0,42690196.0,45996968.0,58571312.0
CHN,2022-01-01,46945694.0,42880142.0,43079433.0,47322938.0,61347262.0
CHN,2021-01-01,46077575.0,42441714.0,43637348.0,48943931.0,63431979.0
CHN,2020-01-01,45390757.0,42444894.0,44208685.0,51917714.0,63016755.0
CHN,2019-01-01,44534610.0,42626158.0,45014490.0,55706672.0,60783908.0
CHN,...,...,...,...,...,...
CHN,1964-01-01,45210547.0,32772310.0,27179605.0,26566937.0,24330081.0
CHN,1963-01-01,43265071.0,31245247.0,27269359.0,26878889.0,23871048.0
CHN,1962-01-01,40877563.0,29902774.0,27482213.0,26824171.0,23384228.0
CHN,1961-01-01,38596810.0,29125584.0,27700640.0,26562875.0,23028989.0


In [30]:
population_data_frames(1990, 'Both', (10,33), 'CHN')

Unnamed: 0_level_0,Unnamed: 1_level_0,Male 1014,Male 1519,Male 2024,Male 2529,Male 3034,Female 1014,Female 1519,Female 2024,Female 2529,Female 3034
date,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-01-01,CHN,47697475.0,43619845.0,42690196.0,45996968.0,58571312.0,40892243.0,37182969.0,36605841.0,40184117.0,52861422.0
2022-01-01,CHN,46945694.0,42880142.0,43079433.0,47322938.0,61347262.0,40161573.0,36559731.0,37035577.0,41562725.0,55702933.0
2021-01-01,CHN,46077575.0,42441714.0,43637348.0,48943931.0,63431979.0,39352043.0,36210584.0,37629021.0,43238213.0,57929324.0
2020-01-01,CHN,45390757.0,42444894.0,44208685.0,51917714.0,63016755.0,38718791.0,36255224.0,38255106.0,46168527.0,57874577.0
2019-01-01,CHN,44534610.0,42626158.0,45014490.0,55706672.0,60783908.0,37959894.0,36469477.0,39112543.0,49873080.0,56135406.0
...,...,...,...,...,...,...,...,...,...,...,...
1964-01-01,CHN,45210547.0,32772310.0,27179605.0,26566937.0,24330081.0,43236900.0,30062519.0,24208269.0,23499927.0,22007969.0
1963-01-01,CHN,43265071.0,31245247.0,27269359.0,26878889.0,23871048.0,41174542.0,28433763.0,24118305.0,23923846.0,21660173.0
1962-01-01,CHN,40877563.0,29902774.0,27482213.0,26824171.0,23384228.0,38550727.0,27084802.0,24120873.0,24034501.0,21287773.0
1961-01-01,CHN,38596810.0,29125584.0,27700640.0,26562875.0,23028989.0,35990952.0,26262486.0,24208928.0,23901128.0,21019399.0


### Deliverable 2 Unit Tests

In [33]:
assert population(year=2000,sex='Male',age_range=(0,100),place='WLD') > 0.0000000000000000001,"Too few males!"
assert len(population_data_frames(year=2000,sex='Both',age_range=(0,31),place='WLD').columns) == 2 * len(population_data_frames(year=2000,sex='Female',age_range=(0,31),place='WLD').columns)


TypeError: 'str' object cannot be interpreted as an integer

### Deliverable 4 Population Pyramids