# Preparation of Population

### Population by Age 2020.xlsx
This notebook prepares the population data from `Population by Age 2020.xlsx`.

In [1]:
# Load required packages
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os

## Load data

In [3]:
# Load the raw Excel sheet into a data frame (path is relative to this notebook)
population = pd.read_excel(
    "../../0_raw_data/public_data/Population by Age 2020.xlsx"
)

# Display the raw frame as loaded, before any cleaning
population

Unnamed: 0,"Population 1 November by region, age, sex and year",Unnamed: 1,Unnamed: 2,Unnamed: 3
0,,,,
1,,,,2020.0
2,01 Stockholm county,0-4 years,men,71910.0
3,,,women,68712.0
4,,5-14 years,men,153143.0
...,...,...,...,...
545,Database:,,,
546,Statistical database,,,
547,,,,
548,Internal reference code:,,,


## Preparatory steps

In [4]:
# Keep only the data rows. Label-based .loc slicing is inclusive on both ends,
# so this retains index labels 2 through 505 and drops the leading rows 0-1
# as well as every row after 505.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not operate on a slice of the raw frame (which pandas
# may flag with SettingWithCopyWarning).
population = population.loc[2:505, :].copy()

In [5]:
# Replace the headers produced by the Excel import with short column names
column_names = {
    "Population 1 November by region, age, sex and year": "region",
    "Unnamed: 1": "age",
    "Unnamed: 2": "sex",
    "Unnamed: 3": "number",
}
population = population.rename(columns=column_names)

In [6]:
# Cast every column to its target dtype in a single call.
# astype with a column -> dtype mapping returns a new frame, which avoids
# assigning columns one at a time onto a frame obtained by row slicing
# (pandas may flag that pattern with SettingWithCopyWarning).
# The three label columns become categoricals; the count stays a float.
population = population.astype(
    {
        "region": "category",
        "age": "category",
        "sex": "category",
        "number": "float",
    }
)

In [7]:
# Fill forward only the columns 'region' and 'age'.
# In the raw sheet these labels appear once per group and are blank on the
# rows that follow, so each blank takes the last label above it.
# 'sex' and 'number' are deliberately left untouched: a frame-wide ffill()
# would silently copy the previous row's value into any missing cell there.
population = population.assign(
    region=population["region"].ffill(),
    age=population["age"].ffill(),
)

In [8]:
# Renumber the rows starting at 0; drop=True discards the old index labels
# instead of keeping them as an extra column
population = population.reset_index(drop=True)

# Show the prepared frame
population

Unnamed: 0,region,age,sex,number
0,01 Stockholm county,0-4 years,men,71910.0
1,01 Stockholm county,0-4 years,women,68712.0
2,01 Stockholm county,5-14 years,men,153143.0
3,01 Stockholm county,5-14 years,women,143376.0
4,01 Stockholm county,15-24 years,men,133050.0
...,...,...,...,...
499,25 Norrbotten county,85-94 years,women,4498.0
500,25 Norrbotten county,95+ years,men,132.0
501,25 Norrbotten county,95+ years,women,353.0
502,25 Norrbotten county,data not available,men,0.0


The region names in this table do **not** match the ones used in the *patients* or *sales* tables.

In [9]:
# Save the prepared data frame
route0 = "../processed_data"

# Create the output directory if needed. exist_ok=True keeps the cell safe to
# re-run and removes the gap between checking for the directory and creating
# it; makedirs also creates any missing parent directories.
os.makedirs(route0, exist_ok=True)

print("saving file corresponding to population.pkl")
population.to_pickle(f"{route0}/population.pkl")

# Read the pickle back and display it to confirm what was written
pd.read_pickle(f"{route0}/population.pkl")

saving file corresponding to population.pkl


Unnamed: 0,region,age,sex,number
0,01 Stockholm county,0-4 years,men,71910.0
1,01 Stockholm county,0-4 years,women,68712.0
2,01 Stockholm county,5-14 years,men,153143.0
3,01 Stockholm county,5-14 years,women,143376.0
4,01 Stockholm county,15-24 years,men,133050.0
...,...,...,...,...
499,25 Norrbotten county,85-94 years,women,4498.0
500,25 Norrbotten county,95+ years,men,132.0
501,25 Norrbotten county,95+ years,women,353.0
502,25 Norrbotten county,data not available,men,0.0
