# Direct download from Eurostat

Browse: <https://ec.europa.eu/eurostat/web/population-demography/demography-population-stock-balance/database?node_code=demomwk>

In [1]:
"""
Set PYTHONPATH for modules in parallel directory

https://stackoverflow.com/questions/3108285/in-python-script-how-do-i-set-pythonpath/3108301

"""

import sys

try:
    sys.path.index('..') # Or os.getcwd() for this directory
except ValueError:
    sys.path.append('..') # Or os.getcwd() for this directory

#--------------------------------------------------------------
import pandas as pd
import eurostat
from share.pandashelpers import RawDf

In [2]:
# toc_df = eurostat.get_toc_df()
# eurostat.subset_toc_df(toc_df, 'Population on 1 January by age and sex')

In [3]:
df_raw = RawDf(eurostat.get_data_df('demo_pjan'))

df_raw.df.rename(columns={'geo\\time' : 'geo'}, inplace=True)
df_raw()

Unnamed: 0,unit,age,sex,geo,2021,2020,2019,2018,2017,2016,...,1969,1968,1967,1966,1965,1964,1963,1962,1961,1960
0,NR,TOTAL,F,AD,,,37388.0,,,,...,,,,,,,,,,
1,NR,TOTAL,F,AL,,1425342.0,1432833.0,1431715.0,1423050.0,1417141.0,...,,,,,,,,,,
2,NR,TOTAL,F,AM,,1562689.0,1563538.0,1564533.0,1567380.0,1569535.0,...,,,,,,,,,,
3,NR,TOTAL,F,AT,,4522292.0,4501742.0,4483749.0,4460424.0,4427918.0,...,3932691.0,3922359.0,3899799.0,3876559.0,3857760.0,3836415.0,3814191.0,3794130.0,3773097.0,3757167.0
4,NR,TOTAL,F,AZ,,5039100.0,4999053.0,4960058.0,4918771.0,4870002.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17524,NR,Y_OPEN,T,SM,,,,8.0,,,...,,,,,,,,,,
17525,NR,Y_OPEN,T,TR,,5567.0,174875.0,5416.0,145341.0,127986.0,...,97602.0,90125.0,83222.0,76850.0,70967.0,69029.0,67153.0,65335.0,63574.0,61869.0
17526,NR,Y_OPEN,T,UA,,11193.0,10365.0,8523.0,6839.0,6482.0,...,,,,,,,,,,
17527,NR,Y_OPEN,T,UK,,,13322.0,14469.0,14589.0,14407.0,...,427700.0,419900.0,408900.0,396300.0,400000.0,400000.0,400000.0,400000.0,300000.0,300000.0


In [4]:
Q = 'geo == "PL"'

df = df_raw().query(Q)[['age', 'sex', 'geo', 2020]]



sex_list = ['T', 'F', 'M']

def make_dfs_sex(df : pd.DataFrame) -> dict:
    dfs = {}
    for sex in sex_list:
        Q = 'sex == "'+sex+'"'
        dfs[sex] = df.query(Q)
    return dfs


dfs = make_dfs_sex(df)
dfs['T']

Unnamed: 0,age,sex,geo,2020
164,TOTAL,T,PL,37958138.0
335,UNK,T,PL,0.0
506,Y1,T,PL,388342.0
677,Y10,T,PL,428843.0
848,Y11,T,PL,426459.0
...,...,...,...,...
16844,Y97,T,PL,8569.0
17009,Y98,T,PL,5097.0
17174,Y99,T,PL,3168.0
17345,Y_LT1,T,PL,367274.0


In [42]:
def make_age_groups() -> dict:    
    age_groups = {}

    age_group = ['Y_LT1']
    start = 1
    end = 18
    age_group=[]
    age_group_name = f"0-{end}"
    for y in range(start,end+1):
        age_group.append(f"Y{y}")
    age_groups[age_group_name] = age_group

    start = 19
    end =24
    age_group_name = f"{start}-{end}"
    age_group=[]
    for y in range(start,end+1):
        age_group.append(f"Y{y}")
    # age_group_name = f"{start}-{end}"
    age_groups[age_group_name] = age_group

    start = 25
    end = 34
    while end < 95:
        age_group_name = f"{start}-{end}"
        age_group=[]
        for y in range(start,end+1):
            age_group.append(f"Y{y}")
        age_groups[age_group_name] = age_group
        start+=10
        end+=10
    
    start = 95
    end = 99
    age_group_name = f"{start}+"
    age_group=[]
    for y in range(start,end+1):
        age_group.append(f"Y{y}")
    age_group.append('Y_OPEN')
    age_groups[age_group_name] = age_group
    
    return age_groups


def population_in_age_group(df : pd.DataFrame, age_group: list) -> None:
    sum = 0
    for age in age_group:
        sum += df[df['age'] == age][2020].values[0]
    return sum

# def age_group_population_df(dfs : dict, age_group_dict : dict) -> pd.DataFrame:
#     row_list = []
#     for sex in sex_list:
#         for key in age_group_dict.keys():
#             population = population_in_age_group(dfs[sex], age_group_dict[key])
#             row_list.append([key,sex,population])
#     dfag = pd.DataFrame(row_list, columns = ['Age group','Sex', 'Population 2020'])
#     return(dfag)


def age_group_population_df(dfs : dict, age_group_dict : dict) -> pd.DataFrame:
    row_list = []
    
    for key in age_group_dict.keys():
        population = []
        for sex in sex_list:
            population.append(population_in_age_group(dfs[sex], age_group_dict[key]))
        row_list.append([key,*population])
    dfag = pd.DataFrame(row_list, columns = ['Age group', 'Population 2020 T', 'Population 2020 F', 'Population 2020 M'])
    return(dfag)

In [46]:
# age_group_dict = make_age_groups()
# age_group_dict

In [47]:
age_group_dict = make_age_groups()
dfag = age_group_population_df(dfs, age_group_dict)
dfag


Unnamed: 0,Age group,Population 2020 T,Population 2020 F,Population 2020 M
0,0-18,6909092.0,3362871.0,3546221.0
1,19-24,2391135.0,1169959.0,1221176.0
2,25-34,5378329.0,2637687.0,2740642.0
3,35-44,6101624.0,3008948.0,3092676.0
4,45-54,4789676.0,2395882.0,2393794.0
5,55-64,5104262.0,2668881.0,2435381.0
6,65-74,4195973.0,2362488.0,1833485.0
7,75-84,1912001.0,1218241.0,693760.0
8,85-94,756428.0,540453.0,215975.0
9,95+,52344.0,40781.0,11563.0


In [48]:
dfag.to_csv('./eurostat_data/Poland_2020_population.csv', index=None)