In [1]:
import pandas as pd
import json
from pathlib import Path

notebook_path = Path().resolve()
gdp_path = notebook_path / "data" / "gdp_per_capita.xlsx"
trade_path = notebook_path / "data" / "trade_by_region.xlsx"

gdp_df = pd.read_excel(gdp_path)
trade_df = pd.read_excel(trade_path)

In [2]:
from data_cleaning import ffill_col, pad_code, split_code_from_name, merge_with_final 

#### GDP per capita

In [3]:
gdp_df

Unnamed: 0,Region,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,WHOLE COUNTRY,26348.6,27874.5,28542.9,29102.9,30362.6,31388.3,32824.2,35369.4,36559.1,...,37414.6,37691.9,38359.5,39254.8,40794.7,42045.9,43197.2,42749.8,44895.0,47897.6
1,MA1 MAINLAND FINLAND,26298.0,27805.9,28477.7,29047.4,30308.2,31336.0,32773.7,35328.2,36528.7,...,37352.3,37626.8,38300.4,39206.3,40751.2,42025.2,43169.1,42752.0,44882.5,47874.7
2,SA1 Helsinki-Uusimaa,36270.1,38454.8,38552.7,38643.2,40312.2,41705.0,43989.1,47355.2,48910.2,...,49746.8,49522.7,51046.2,51914.0,53512.8,54982.0,56756.6,55424.5,57622.9,60375.8
3,SA2 Southern Finland,24510.8,25496.1,26535.2,26874.1,28274.5,28920.1,29837.2,31909.0,32610.7,...,32757.9,33324.9,33634.8,34555.8,36405.5,37394.7,38162.7,37852.5,39788.7,42355.0
4,SA3 Western Finland,22977.4,24934.3,25308.7,26021.4,26790.3,27939.4,28984.6,31658.4,32883.7,...,33489.3,33730.7,34192.0,34652.5,35861.8,37086.4,37870.4,37835.0,40081.9,42685.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,SK196 Tunturi-Lappi,15833.5,17318.9,17532.8,18249.6,19326.9,20719.5,21910.2,23931.1,27646.8,...,38063.4,33107.1,38561.2,42374.2,43613.5,42995.3,46392.0,51480.3,50834.7,58063.0
115,SK197 Pohjois-Lappi,16831.4,17697.7,18944.0,19436.5,20576.7,22876.0,23965.0,24254.0,26963.6,...,34416.5,37545.2,39502.1,39789.2,48466.2,51082.0,45736.6,49815.2,59296.8,69084.8
116,SK211 Mariehamns stad,55008.2,67201.7,63249.5,64894.8,66111.2,67262.4,64953.7,65050.8,61924.5,...,73320.6,77689.8,76942.9,74140.8,75018.3,69275.6,71537.6,56690.4,65761.7,79980.2
117,SK212 Ålands landsbygd,15005.7,16118.9,19551.3,19139.8,20030.9,20355.4,22774.3,24374.4,24964.3,...,30088.5,28278.3,28423.4,28626.2,28606.3,26375.7,28128.2,28584.5,31092.4,30625.0


Dataset gdp_df contains:
- Provinces
- Regions
- Well-being areas
- Sub-regions

We will only be using regions. (Perhaps it may make sense to use sub-regions in the future, if we see that municipalities are too small.)

In [4]:
gdp_df = gdp_df[gdp_df["Region"].astype(str).str.contains("MK")]
gdp_df

Unnamed: 0,Region,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
6,MK01 Uusimaa,36270.1,38454.8,38552.7,38643.2,40312.2,41705.0,43989.1,47355.2,48910.2,...,49746.8,49522.7,51046.2,51914.0,53512.8,54982.0,56756.6,55424.5,57622.9,60375.8
7,MK02 Southwest Finland,26094.0,26551.8,29570.2,29065.6,30627.2,31344.2,32924.9,36024.5,36529.7,...,33902.3,34387.2,34417.8,36103.0,38440.6,39544.4,40404.6,39752.0,41181.0,44629.6
8,MK04 Satakunta,23325.3,25814.2,26215.8,26399.7,26385.4,27884.1,29245.8,31059.4,32021.8,...,34143.3,35164.0,36178.0,34955.6,36167.8,36858.4,36564.9,37034.6,39248.0,42468.6
9,MK05 Kanta-Häme,20656.4,22253.3,22360.0,23708.2,25107.3,25497.5,26220.6,27819.6,30068.9,...,31740.2,31335.8,31491.3,32119.7,33124.4,33256.0,34667.5,35522.7,37967.0,38841.6
10,MK06 Pirkanmaa,24923.3,27338.9,28117.4,28831.7,30063.5,31234.4,31899.4,34830.6,36086.9,...,34972.6,34596.1,34608.1,35947.5,37247.0,38781.9,39863.9,40148.2,41910.2,44133.8
11,MK07 Päijät-Häme,20701.5,22123.4,21861.5,23141.2,24072.2,25006.1,25399.4,26594.8,28140.5,...,29858.5,30513.9,30755.5,31679.5,32139.8,32759.5,32905.6,32745.5,34467.9,37708.0
12,MK08 Kymenlaakso,27357.0,28665.5,28068.5,27953.3,30399.0,31270.8,31359.5,31679.1,30890.4,...,32115.4,32837.7,34903.2,34794.8,37371.4,39846.3,41287.3,41506.5,42936.9,44126.7
13,MK09 South Karelia,25954.6,26856.8,26612.6,27694.5,27903.7,27889.7,28703.2,31543.9,31602.5,...,35479.0,37266.2,36577.3,36455.6,38887.9,39219.9,38900.7,37378.9,41539.3,43646.0
14,MK10 South Savo,17678.1,18777.3,19360.0,20455.1,21581.9,22528.8,23271.5,25984.0,25908.8,...,28566.0,29834.7,29441.1,30397.6,31079.4,32249.7,32910.2,33034.8,34167.4,37847.9
15,MK11 North Savo,19493.5,20881.4,21324.0,22128.3,22665.7,24076.4,24992.9,27091.2,28875.0,...,30119.2,30842.8,31361.6,32053.9,33681.4,35220.2,36065.6,35762.4,37853.5,42213.7


Now, we restructure the dataset from the following...

Region | 2000 | 2001 | ... | 2021 | 2022
---|---|---|---|---|---
01 | 36270.1 | 38454.8 | ... | 57622.9 | 60375.8
... | ... | ... | ...| ... | ...
21 | 31352.2 | 37019.8 | ... | 44250.6 | 48912.8

so that it is fits the following format:

Region | Year | GDP per capita (euro at current prices)
---|---|---
01 | 2000 | 36270.1
01 | 2001 | 38454.8
... | ... | ... | ...
21 | 2022 | 48912.8

In [5]:
long_gdp_df = gdp_df.melt(
    id_vars=["Region"],          # keep Region as is
    var_name="Year",             # new column name for former column headers
    value_name="GDP per capita (euro at current prices)"  # name for values
)
long_gdp_df

Unnamed: 0,Region,Year,GDP per capita (euro at current prices)
0,MK01 Uusimaa,2000,36270.1
1,MK02 Southwest Finland,2000,26094.0
2,MK04 Satakunta,2000,23325.3
3,MK05 Kanta-Häme,2000,20656.4
4,MK06 Pirkanmaa,2000,24923.3
...,...,...,...
432,MK16 Central Ostrobothnia,2022,40369.2
433,MK17 North Ostrobothnia,2022,41666.7
434,MK18 Kainuu,2022,42125.5
435,MK19 Lapland,2022,46618.7


In [6]:
long_gdp_df = split_code_from_name(long_gdp_df, "Region", code_length=2)
long_gdp_df

Unnamed: 0,Region,Year,GDP per capita (euro at current prices)
0,01,2000,36270.1
1,02,2000,26094.0
2,04,2000,23325.3
3,05,2000,20656.4
4,06,2000,24923.3
...,...,...,...
432,16,2022,40369.2
433,17,2022,41666.7
434,18,2022,42125.5
435,19,2022,46618.7


### Trade

In [7]:
trade_df

Unnamed: 0,Year,Region,Flow,Cum. statistical value (euro) from the beginning of the year
0,202412,1 Uusimaa,Imports by countries of origin,42750000000
1,202412,1 Uusimaa,Exports by countries of destination,22418000000
2,202412,2 Varsinais-Suomi,Imports by countries of origin,4737000000
3,202412,2 Varsinais-Suomi,Exports by countries of destination,6718000000
4,202412,3 Satakunta,Imports by countries of origin,3332000000
...,...,...,...,...
415,201512,19 Ahvenanmaa,Exports by countries of destination,103000000
416,201512,20 Unknown,Imports by countries of origin,1226000000
417,201512,20 Unknown,Exports by countries of destination,429000000
418,201512,21 Total,Imports by countries of origin,54493000000


We restructure the dataset from...

Year | Region | Flow | Cum. statistical value (euro) from the beginning of the year
---|---|---|---
2024 | 1 Uusimaa | Imports by countries of origin | 42750000000.0
2024 | 1 Uusimaa | Exports by countries of origin | 22418000000.0
... | ... | ... | ...

To the following structure:

Region | Year | Imports (euro) | Exports (euro)
---|---|---|---
1 Uusimaa | 2024 | 42750000000.0 | 22418000000.0
... | ... | ... | ...


In [8]:
# Pivot wider
wide_trade_df = trade_df.pivot_table(
    index=["Region", "Year"],  # keep Region and Year
    columns="Flow",            # values in Flow become columns
    values="Cum. statistical value (euro) from the beginning of the year",
    aggfunc="sum"              # in case of duplicates
).reset_index()

wide_trade_df

Flow,Region,Year,Exports by countries of destination,Imports by countries of origin
0,1 Uusimaa,201512,16912000000,33474000000
1,1 Uusimaa,201612,15850000000,34441000000
2,1 Uusimaa,201712,18255000000,36814000000
3,1 Uusimaa,201812,18717000000,38838000000
4,1 Uusimaa,201912,19361000000,38416000000
...,...,...,...,...
205,9 Etelä-Savo,202012,398000000,286000000
206,9 Etelä-Savo,202112,478000000,311000000
207,9 Etelä-Savo,202212,516000000,316000000
208,9 Etelä-Savo,202312,497000000,325000000


In [9]:
wide_trade_df = wide_trade_df.rename(columns={
    "Imports by countries of origin": "Imports (euro)",
    "Exports by countries of destination": "Exports (euro)"
})

wide_trade_df = split_code_from_name(wide_trade_df, "Region", code_length=2)

wide_trade_df["Year"] = wide_trade_df["Year"].astype(str).str[:4]
wide_trade_df

Flow,Region,Year,Exports (euro),Imports (euro)
0,01,2015,16912000000,33474000000
1,01,2016,15850000000,34441000000
2,01,2017,18255000000,36814000000
3,01,2018,18717000000,38838000000
4,01,2019,19361000000,38416000000
...,...,...,...,...
205,09,2020,398000000,286000000
206,09,2021,478000000,311000000
207,09,2022,516000000,316000000
208,09,2023,497000000,325000000


### Merge GDP per capita and Trade

In [10]:
final_df = pd.merge(
    left=wide_trade_df,
    right=long_gdp_df,
    how="outer",
    on=["Region", "Year"]
)
final_df

Unnamed: 0,Region,Year,Exports (euro),Imports (euro),GDP per capita (euro at current prices)
0,01,2000,,,36270.1
1,01,2001,,,38454.8
2,01,2002,,,38552.7
3,01,2003,,,38643.2
4,01,2004,,,40312.2
...,...,...,...,...,...
490,21,2020,5.744000e+10,5.976800e+10,39532.8
491,21,2021,6.890800e+10,7.293600e+10,44250.6
492,21,2022,8.188500e+10,9.246900e+10,48912.8
493,21,2023,7.636200e+10,7.684900e+10,


In [11]:
# now add column "Municipality" to be filled with "TOTAL"

final_df["Municipality"] = "TOTAL"
final_df

Unnamed: 0,Region,Year,Exports (euro),Imports (euro),GDP per capita (euro at current prices),Municipality
0,01,2000,,,36270.1,TOTAL
1,01,2001,,,38454.8,TOTAL
2,01,2002,,,38552.7,TOTAL
3,01,2003,,,38643.2,TOTAL
4,01,2004,,,40312.2,TOTAL
...,...,...,...,...,...,...
490,21,2020,5.744000e+10,5.976800e+10,39532.8,TOTAL
491,21,2021,6.890800e+10,7.293600e+10,44250.6,TOTAL
492,21,2022,8.188500e+10,9.246900e+10,48912.8,TOTAL
493,21,2023,7.636200e+10,7.684900e+10,,TOTAL


In [12]:
# reorder columns
final_df = final_df[
    ["Region", "Municipality", "Year", "GDP per capita (euro at current prices)",
     "Imports (euro)", "Exports (euro)"]
]

final_df

Unnamed: 0,Region,Municipality,Year,GDP per capita (euro at current prices),Imports (euro),Exports (euro)
0,01,TOTAL,2000,36270.1,,
1,01,TOTAL,2001,38454.8,,
2,01,TOTAL,2002,38552.7,,
3,01,TOTAL,2003,38643.2,,
4,01,TOTAL,2004,40312.2,,
...,...,...,...,...,...,...
490,21,TOTAL,2020,39532.8,5.976800e+10,5.744000e+10
491,21,TOTAL,2021,44250.6,7.293600e+10,6.890800e+10
492,21,TOTAL,2022,48912.8,9.246900e+10,8.188500e+10
493,21,TOTAL,2023,,7.684900e+10,7.636200e+10


### Gross value added


In [13]:
import pandas as pd
from pathlib import Path

notebook_path = Path().resolve()
gross_value_path = notebook_path / "data" / "gross_value_added.xlsx"

gross_value_df = pd.read_excel(gross_value_path)
gross_value_df

Unnamed: 0,Region,Year,"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)","Gross value added (millions of euro), H Transportation and storage (49-53)","Gross value added (millions of euro), I Accommodation and food service activities (55-56)","Gross value added (millions of euro), J Information and communication (58-63)","Gross value added (millions of euro), K Financial and insurance activities (64-66)","Gross value added (millions of euro), M Professional, scientific and technical activities (69-75)","Gross value added (millions of euro), N Administrative and support service activities (77-82)","Gross value added (millions of euro), O Public administration and defence; compulsory social security (84)","Gross value added (millions of euro), P Education (85)","Gross value added (millions of euro), Q Human health and social work activities (86-88)","Gross value added (millions of euro), R, S Other service activities (90-96)","Gross value added (millions of euro), T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use(97-98)"
0,WHOLE COUNTRY,2000,4030.0,283,7344,11034,7356,1534,5735,3936,4454,2159,7039,5992,8761,2862,56
1,,2001,3998.0,330,7464,12018,7949,1831,6858,4214,4950,2418,7365,6340,9461,3086,53
2,,2002,3968.0,350,7254,12563,7997,1882,7324,3726,5020,2646,7593,6696,9991,3221,58
3,,2003,3848.0,375,7652,12735,7949,1927,7196,3503,5248,2796,7938,6993,10515,3352,78
4,,2004,3778.0,388,8329,13618,8026,2152,7811,3769,5640,3122,8277,7328,11092,3524,109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,,2019,30.2,1.6,84.7,74.8,235.3,27,51.6,67.6,29,12.1,50.1,66.3,138.1,36.9,0.9
476,,2020,31.3,1.5,89.4,63.4,103.7,17,52.2,72.6,29.1,9.9,47.7,67,143.8,38.4,1.1
477,,2021,29.3,1.7,94.3,73.5,137.5,28.8,58.9,84.4,29.1,11.7,55.5,72.8,154.1,54,1.3
478,,2022,21.7,1.7,83,76.5,218.9,33.7,53.9,109.4,29.7,13.9,64,77,166.3,15.6,1.1


In [14]:
# for rows where Year is "2023*", rename to "2023" and any values in the row that are "..." to NaN
gross_value_df["Year"] = gross_value_df["Year"].astype(str).str.replace("2023.*", "2023", regex=True)
gross_value_df = gross_value_df.replace("...", pd.NA)
gross_value_df

Unnamed: 0,Region,Year,"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)","Gross value added (millions of euro), H Transportation and storage (49-53)","Gross value added (millions of euro), I Accommodation and food service activities (55-56)","Gross value added (millions of euro), J Information and communication (58-63)","Gross value added (millions of euro), K Financial and insurance activities (64-66)","Gross value added (millions of euro), M Professional, scientific and technical activities (69-75)","Gross value added (millions of euro), N Administrative and support service activities (77-82)","Gross value added (millions of euro), O Public administration and defence; compulsory social security (84)","Gross value added (millions of euro), P Education (85)","Gross value added (millions of euro), Q Human health and social work activities (86-88)","Gross value added (millions of euro), R, S Other service activities (90-96)","Gross value added (millions of euro), T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use(97-98)"
0,WHOLE COUNTRY,2000,4030.0,283,7344,11034,7356,1534,5735,3936,4454,2159,7039,5992,8761,2862,56
1,,2001,3998.0,330,7464,12018,7949,1831,6858,4214,4950,2418,7365,6340,9461,3086,53
2,,2002,3968.0,350,7254,12563,7997,1882,7324,3726,5020,2646,7593,6696,9991,3221,58
3,,2003,3848.0,375,7652,12735,7949,1927,7196,3503,5248,2796,7938,6993,10515,3352,78
4,,2004,3778.0,388,8329,13618,8026,2152,7811,3769,5640,3122,8277,7328,11092,3524,109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,,2019,30.2,1.6,84.7,74.8,235.3,27,51.6,67.6,29,12.1,50.1,66.3,138.1,36.9,0.9
476,,2020,31.3,1.5,89.4,63.4,103.7,17,52.2,72.6,29.1,9.9,47.7,67,143.8,38.4,1.1
477,,2021,29.3,1.7,94.3,73.5,137.5,28.8,58.9,84.4,29.1,11.7,55.5,72.8,154.1,54,1.3
478,,2022,21.7,1.7,83,76.5,218.9,33.7,53.9,109.4,29.7,13.9,64,77,166.3,15.6,1.1


In [15]:
# ffill for Region
gross_value_df["Region"] = gross_value_df["Region"].ffill()
gross_value_df

Unnamed: 0,Region,Year,"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)","Gross value added (millions of euro), H Transportation and storage (49-53)","Gross value added (millions of euro), I Accommodation and food service activities (55-56)","Gross value added (millions of euro), J Information and communication (58-63)","Gross value added (millions of euro), K Financial and insurance activities (64-66)","Gross value added (millions of euro), M Professional, scientific and technical activities (69-75)","Gross value added (millions of euro), N Administrative and support service activities (77-82)","Gross value added (millions of euro), O Public administration and defence; compulsory social security (84)","Gross value added (millions of euro), P Education (85)","Gross value added (millions of euro), Q Human health and social work activities (86-88)","Gross value added (millions of euro), R, S Other service activities (90-96)","Gross value added (millions of euro), T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use(97-98)"
0,WHOLE COUNTRY,2000,4030.0,283,7344,11034,7356,1534,5735,3936,4454,2159,7039,5992,8761,2862,56
1,WHOLE COUNTRY,2001,3998.0,330,7464,12018,7949,1831,6858,4214,4950,2418,7365,6340,9461,3086,53
2,WHOLE COUNTRY,2002,3968.0,350,7254,12563,7997,1882,7324,3726,5020,2646,7593,6696,9991,3221,58
3,WHOLE COUNTRY,2003,3848.0,375,7652,12735,7949,1927,7196,3503,5248,2796,7938,6993,10515,3352,78
4,WHOLE COUNTRY,2004,3778.0,388,8329,13618,8026,2152,7811,3769,5640,3122,8277,7328,11092,3524,109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,MK21 Åland,2019,30.2,1.6,84.7,74.8,235.3,27,51.6,67.6,29,12.1,50.1,66.3,138.1,36.9,0.9
476,MK21 Åland,2020,31.3,1.5,89.4,63.4,103.7,17,52.2,72.6,29.1,9.9,47.7,67,143.8,38.4,1.1
477,MK21 Åland,2021,29.3,1.7,94.3,73.5,137.5,28.8,58.9,84.4,29.1,11.7,55.5,72.8,154.1,54,1.3
478,MK21 Åland,2022,21.7,1.7,83,76.5,218.9,33.7,53.9,109.4,29.7,13.9,64,77,166.3,15.6,1.1


In [16]:
gross_value_df["Region"] = (
    gross_value_df["Region"]
    .astype(str)
    .str.split(" ").str[0]  # take only region code
    .str[-2:]
)
gross_value_df

Unnamed: 0,Region,Year,"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)","Gross value added (millions of euro), H Transportation and storage (49-53)","Gross value added (millions of euro), I Accommodation and food service activities (55-56)","Gross value added (millions of euro), J Information and communication (58-63)","Gross value added (millions of euro), K Financial and insurance activities (64-66)","Gross value added (millions of euro), M Professional, scientific and technical activities (69-75)","Gross value added (millions of euro), N Administrative and support service activities (77-82)","Gross value added (millions of euro), O Public administration and defence; compulsory social security (84)","Gross value added (millions of euro), P Education (85)","Gross value added (millions of euro), Q Human health and social work activities (86-88)","Gross value added (millions of euro), R, S Other service activities (90-96)","Gross value added (millions of euro), T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use(97-98)"
0,LE,2000,4030.0,283,7344,11034,7356,1534,5735,3936,4454,2159,7039,5992,8761,2862,56
1,LE,2001,3998.0,330,7464,12018,7949,1831,6858,4214,4950,2418,7365,6340,9461,3086,53
2,LE,2002,3968.0,350,7254,12563,7997,1882,7324,3726,5020,2646,7593,6696,9991,3221,58
3,LE,2003,3848.0,375,7652,12735,7949,1927,7196,3503,5248,2796,7938,6993,10515,3352,78
4,LE,2004,3778.0,388,8329,13618,8026,2152,7811,3769,5640,3122,8277,7328,11092,3524,109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,21,2019,30.2,1.6,84.7,74.8,235.3,27,51.6,67.6,29,12.1,50.1,66.3,138.1,36.9,0.9
476,21,2020,31.3,1.5,89.4,63.4,103.7,17,52.2,72.6,29.1,9.9,47.7,67,143.8,38.4,1.1
477,21,2021,29.3,1.7,94.3,73.5,137.5,28.8,58.9,84.4,29.1,11.7,55.5,72.8,154.1,54,1.3
478,21,2022,21.7,1.7,83,76.5,218.9,33.7,53.9,109.4,29.7,13.9,64,77,166.3,15.6,1.1


In [17]:
gross_value_df["Region"].replace("LE", "WHOLE COUNTRY", inplace=True)
gross_value_df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  gross_value_df["Region"].replace("LE", "WHOLE COUNTRY", inplace=True)


Unnamed: 0,Region,Year,"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)","Gross value added (millions of euro), H Transportation and storage (49-53)","Gross value added (millions of euro), I Accommodation and food service activities (55-56)","Gross value added (millions of euro), J Information and communication (58-63)","Gross value added (millions of euro), K Financial and insurance activities (64-66)","Gross value added (millions of euro), M Professional, scientific and technical activities (69-75)","Gross value added (millions of euro), N Administrative and support service activities (77-82)","Gross value added (millions of euro), O Public administration and defence; compulsory social security (84)","Gross value added (millions of euro), P Education (85)","Gross value added (millions of euro), Q Human health and social work activities (86-88)","Gross value added (millions of euro), R, S Other service activities (90-96)","Gross value added (millions of euro), T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use(97-98)"
0,WHOLE COUNTRY,2000,4030.0,283,7344,11034,7356,1534,5735,3936,4454,2159,7039,5992,8761,2862,56
1,WHOLE COUNTRY,2001,3998.0,330,7464,12018,7949,1831,6858,4214,4950,2418,7365,6340,9461,3086,53
2,WHOLE COUNTRY,2002,3968.0,350,7254,12563,7997,1882,7324,3726,5020,2646,7593,6696,9991,3221,58
3,WHOLE COUNTRY,2003,3848.0,375,7652,12735,7949,1927,7196,3503,5248,2796,7938,6993,10515,3352,78
4,WHOLE COUNTRY,2004,3778.0,388,8329,13618,8026,2152,7811,3769,5640,3122,8277,7328,11092,3524,109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,21,2019,30.2,1.6,84.7,74.8,235.3,27,51.6,67.6,29,12.1,50.1,66.3,138.1,36.9,0.9
476,21,2020,31.3,1.5,89.4,63.4,103.7,17,52.2,72.6,29.1,9.9,47.7,67,143.8,38.4,1.1
477,21,2021,29.3,1.7,94.3,73.5,137.5,28.8,58.9,84.4,29.1,11.7,55.5,72.8,154.1,54,1.3
478,21,2022,21.7,1.7,83,76.5,218.9,33.7,53.9,109.4,29.7,13.9,64,77,166.3,15.6,1.1


In [18]:
gross_value_df["Municipality"] = "TOTAL"
gross_value_df

Unnamed: 0,Region,Year,"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)","Gross value added (millions of euro), H Transportation and storage (49-53)","Gross value added (millions of euro), I Accommodation and food service activities (55-56)","Gross value added (millions of euro), J Information and communication (58-63)","Gross value added (millions of euro), K Financial and insurance activities (64-66)","Gross value added (millions of euro), M Professional, scientific and technical activities (69-75)","Gross value added (millions of euro), N Administrative and support service activities (77-82)","Gross value added (millions of euro), O Public administration and defence; compulsory social security (84)","Gross value added (millions of euro), P Education (85)","Gross value added (millions of euro), Q Human health and social work activities (86-88)","Gross value added (millions of euro), R, S Other service activities (90-96)","Gross value added (millions of euro), T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use(97-98)",Municipality
0,WHOLE COUNTRY,2000,4030.0,283,7344,11034,7356,1534,5735,3936,4454,2159,7039,5992,8761,2862,56,TOTAL
1,WHOLE COUNTRY,2001,3998.0,330,7464,12018,7949,1831,6858,4214,4950,2418,7365,6340,9461,3086,53,TOTAL
2,WHOLE COUNTRY,2002,3968.0,350,7254,12563,7997,1882,7324,3726,5020,2646,7593,6696,9991,3221,58,TOTAL
3,WHOLE COUNTRY,2003,3848.0,375,7652,12735,7949,1927,7196,3503,5248,2796,7938,6993,10515,3352,78,TOTAL
4,WHOLE COUNTRY,2004,3778.0,388,8329,13618,8026,2152,7811,3769,5640,3122,8277,7328,11092,3524,109,TOTAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,21,2019,30.2,1.6,84.7,74.8,235.3,27,51.6,67.6,29,12.1,50.1,66.3,138.1,36.9,0.9,TOTAL
476,21,2020,31.3,1.5,89.4,63.4,103.7,17,52.2,72.6,29.1,9.9,47.7,67,143.8,38.4,1.1,TOTAL
477,21,2021,29.3,1.7,94.3,73.5,137.5,28.8,58.9,84.4,29.1,11.7,55.5,72.8,154.1,54,1.3,TOTAL
478,21,2022,21.7,1.7,83,76.5,218.9,33.7,53.9,109.4,29.7,13.9,64,77,166.3,15.6,1.1,TOTAL


### Merge gross value with GDP_trade dataset

In [19]:
final_df = pd.merge(
    left=final_df,
    right=gross_value_df,
    how="outer",
    on=["Region", "Year", "Municipality"]
)
final_df

Unnamed: 0,Region,Municipality,Year,GDP per capita (euro at current prices),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)",...,"Gross value added (millions of euro), I Accommodation and food service activities (55-56)","Gross value added (millions of euro), J Information and communication (58-63)","Gross value added (millions of euro), K Financial and insurance activities (64-66)","Gross value added (millions of euro), M Professional, scientific and technical activities (69-75)","Gross value added (millions of euro), N Administrative and support service activities (77-82)","Gross value added (millions of euro), O Public administration and defence; compulsory social security (84)","Gross value added (millions of euro), P Education (85)","Gross value added (millions of euro), Q Human health and social work activities (86-88)","Gross value added (millions of euro), R, S Other service activities (90-96)","Gross value added (millions of euro), T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use(97-98)"
0,01,TOTAL,2000,36270.1,,,255.5,40.8,2501.6,5985.4,...,609.3,3475.9,2205.2,2677.2,1070.4,2374,1808.3,2503.3,1123.2,12.4
1,01,TOTAL,2001,38454.8,,,235.4,43.4,2766.6,6491.5,...,736.1,4055.9,2341.5,2818.8,1202.1,2481.2,1906.9,2767.8,1154.7,7.7
2,01,TOTAL,2002,38552.7,,,246.1,39.1,2449.9,6530.9,...,751.4,4565.4,2108.8,2828.2,1253.7,2506.5,1981.8,2977.4,1256.3,8.9
3,01,TOTAL,2003,38643.2,,,235.6,34.6,2567,6380.4,...,767.4,4293.5,1982.5,2929.8,1366.6,2635.8,2043.4,3056,1299.9,11.4
4,01,TOTAL,2004,40312.2,,,233.9,42.8,2754.4,6842.6,...,851.3,4858.8,2246,3105.6,1545.6,2852.3,2125.5,3181.5,1268.6,16.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514,WHOLE COUNTRY,TOTAL,2019,,,,5223.0,819,14966,17939,...,3649,12305,6028,10861,8137,10529,10861,20277,6101,307
515,WHOLE COUNTRY,TOTAL,2020,,,,5296.0,964,15368,18421,...,2550,12670,6175,11018,7540,10635,11012,20747,5723,328
516,WHOLE COUNTRY,TOTAL,2021,,,,5371.0,1049,14924,19070,...,2562,13796,7046,11745,7788,11161,11560,22151,5982,322
517,WHOLE COUNTRY,TOTAL,2022,,,,5961.0,1100,15411,19557,...,3571,14036,8100,12374,8381,11742,12052,23189,6366,317


### Hours worked

In [20]:
hours_worked_path = notebook_path / "data" / "hours_worked.xlsx"
hours_worked_df = pd.read_excel(hours_worked_path)
hours_worked_df

Unnamed: 0,Region,Year,"Hours worked, domestic (1 000 000 h)"
0,MK01 Uusimaa,2000,1251.708
1,,2001,1267.148
2,,2002,1283.743
3,,2003,1261.744
4,,2004,1261.911
...,...,...,...
432,,2018,28.497
433,,2019,28.935
434,,2020,28.121
435,,2021,26.624


In [21]:
# ffill Region
hours_worked_df["Region"] = hours_worked_df["Region"].ffill()
hours_worked_df

Unnamed: 0,Region,Year,"Hours worked, domestic (1 000 000 h)"
0,MK01 Uusimaa,2000,1251.708
1,MK01 Uusimaa,2001,1267.148
2,MK01 Uusimaa,2002,1283.743
3,MK01 Uusimaa,2003,1261.744
4,MK01 Uusimaa,2004,1261.911
...,...,...,...
432,MK21 Åland,2018,28.497
433,MK21 Åland,2019,28.935
434,MK21 Åland,2020,28.121
435,MK21 Åland,2021,26.624


In [22]:
hours_worked_df["Region"] = (
    hours_worked_df["Region"]
    .astype(str)
    .str.split(" ").str[0]  # take only region code
    .str[-2:]
)
hours_worked_df

Unnamed: 0,Region,Year,"Hours worked, domestic (1 000 000 h)"
0,01,2000,1251.708
1,01,2001,1267.148
2,01,2002,1283.743
3,01,2003,1261.744
4,01,2004,1261.911
...,...,...,...
432,21,2018,28.497
433,21,2019,28.935
434,21,2020,28.121
435,21,2021,26.624


In [23]:
hours_worked_df["Municipality"] = "TOTAL"
hours_worked_df

Unnamed: 0,Region,Year,"Hours worked, domestic (1 000 000 h)",Municipality
0,01,2000,1251.708,TOTAL
1,01,2001,1267.148,TOTAL
2,01,2002,1283.743,TOTAL
3,01,2003,1261.744,TOTAL
4,01,2004,1261.911,TOTAL
...,...,...,...,...
432,21,2018,28.497,TOTAL
433,21,2019,28.935,TOTAL
434,21,2020,28.121,TOTAL
435,21,2021,26.624,TOTAL


In [24]:
final_df["Year"] = final_df["Year"].astype(int)
hours_worked_df["Year"] = hours_worked_df["Year"].astype(int)

In [25]:
final_df = pd.merge(
    left=final_df,
    right=hours_worked_df,
    how="outer",
    on=["Region", "Year", "Municipality"]
)
final_df

Unnamed: 0,Region,Municipality,Year,GDP per capita (euro at current prices),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)",...,"Gross value added (millions of euro), J Information and communication (58-63)","Gross value added (millions of euro), K Financial and insurance activities (64-66)","Gross value added (millions of euro), M Professional, scientific and technical activities (69-75)","Gross value added (millions of euro), N Administrative and support service activities (77-82)","Gross value added (millions of euro), O Public administration and defence; compulsory social security (84)","Gross value added (millions of euro), P Education (85)","Gross value added (millions of euro), Q Human health and social work activities (86-88)","Gross value added (millions of euro), R, S Other service activities (90-96)","Gross value added (millions of euro), T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use(97-98)","Hours worked, domestic (1 000 000 h)"
0,01,TOTAL,2000,36270.1,,,255.5,40.8,2501.6,5985.4,...,3475.9,2205.2,2677.2,1070.4,2374,1808.3,2503.3,1123.2,12.4,1251.708
1,01,TOTAL,2001,38454.8,,,235.4,43.4,2766.6,6491.5,...,4055.9,2341.5,2818.8,1202.1,2481.2,1906.9,2767.8,1154.7,7.7,1267.148
2,01,TOTAL,2002,38552.7,,,246.1,39.1,2449.9,6530.9,...,4565.4,2108.8,2828.2,1253.7,2506.5,1981.8,2977.4,1256.3,8.9,1283.743
3,01,TOTAL,2003,38643.2,,,235.6,34.6,2567,6380.4,...,4293.5,1982.5,2929.8,1366.6,2635.8,2043.4,3056,1299.9,11.4,1261.744
4,01,TOTAL,2004,40312.2,,,233.9,42.8,2754.4,6842.6,...,4858.8,2246,3105.6,1545.6,2852.3,2125.5,3181.5,1268.6,16.7,1261.911
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514,WHOLE COUNTRY,TOTAL,2019,,,,5223.0,819,14966,17939,...,12305,6028,10861,8137,10529,10861,20277,6101,307,
515,WHOLE COUNTRY,TOTAL,2020,,,,5296.0,964,15368,18421,...,12670,6175,11018,7540,10635,11012,20747,5723,328,
516,WHOLE COUNTRY,TOTAL,2021,,,,5371.0,1049,14924,19070,...,13796,7046,11745,7788,11161,11560,22151,5982,322,
517,WHOLE COUNTRY,TOTAL,2022,,,,5961.0,1100,15411,19557,...,14036,8100,12374,8381,11742,12052,23189,6366,317,


In [26]:
final_df = final_df.sort_values(["Region", "Year"])
final_df

Unnamed: 0,Region,Municipality,Year,GDP per capita (euro at current prices),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)",...,"Gross value added (millions of euro), J Information and communication (58-63)","Gross value added (millions of euro), K Financial and insurance activities (64-66)","Gross value added (millions of euro), M Professional, scientific and technical activities (69-75)","Gross value added (millions of euro), N Administrative and support service activities (77-82)","Gross value added (millions of euro), O Public administration and defence; compulsory social security (84)","Gross value added (millions of euro), P Education (85)","Gross value added (millions of euro), Q Human health and social work activities (86-88)","Gross value added (millions of euro), R, S Other service activities (90-96)","Gross value added (millions of euro), T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use(97-98)","Hours worked, domestic (1 000 000 h)"
0,01,TOTAL,2000,36270.1,,,255.5,40.8,2501.6,5985.4,...,3475.9,2205.2,2677.2,1070.4,2374,1808.3,2503.3,1123.2,12.4,1251.708
1,01,TOTAL,2001,38454.8,,,235.4,43.4,2766.6,6491.5,...,4055.9,2341.5,2818.8,1202.1,2481.2,1906.9,2767.8,1154.7,7.7,1267.148
2,01,TOTAL,2002,38552.7,,,246.1,39.1,2449.9,6530.9,...,4565.4,2108.8,2828.2,1253.7,2506.5,1981.8,2977.4,1256.3,8.9,1283.743
3,01,TOTAL,2003,38643.2,,,235.6,34.6,2567,6380.4,...,4293.5,1982.5,2929.8,1366.6,2635.8,2043.4,3056,1299.9,11.4,1261.744
4,01,TOTAL,2004,40312.2,,,233.9,42.8,2754.4,6842.6,...,4858.8,2246,3105.6,1545.6,2852.3,2125.5,3181.5,1268.6,16.7,1261.911
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514,WHOLE COUNTRY,TOTAL,2019,,,,5223.0,819,14966,17939,...,12305,6028,10861,8137,10529,10861,20277,6101,307,
515,WHOLE COUNTRY,TOTAL,2020,,,,5296.0,964,15368,18421,...,12670,6175,11018,7540,10635,11012,20747,5723,328,
516,WHOLE COUNTRY,TOTAL,2021,,,,5371.0,1049,14924,19070,...,13796,7046,11745,7788,11161,11560,22151,5982,322,
517,WHOLE COUNTRY,TOTAL,2022,,,,5961.0,1100,15411,19557,...,14036,8100,12374,8381,11742,12052,23189,6366,317,


### Workplace and population (Kareem's dataset)

In [27]:
workplace_population_path = notebook_path / "data" / "workplaceANDpopulation.csv"
workplace_population_df = pd.read_csv(workplace_population_path, dtype={"Region": str, "Municipality": str, "Year": int})
workplace_population_df

Unnamed: 0,Region,Municipality,Year,"A Agriculture, forestry and fishing (TP)",B Mining and quarrying (TP),C Manufacturing (TP),Children aged 0 to 14 (PT),"D Electricity, gas, steam and air conditioning supply (TP)","E Water supply; sewerage, waste management and remediation activities (TP)",Employed (PT),...,Q Human health and social work activities (TP),"R Arts, entertainment and recreation (TP)",S Other service activities (TP),Services (TP),Students (PT),T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use (TP),U Activities of extraterritorial organisations and bodies (TP),Unemployed (PT),"Workplaces, total (TP)",X Industry unknown (TP)
0,13,77,2010,232,6,154,854,0,6,1994,...,277,20,65,917,340,3,0,330,1402,0
1,13,77,2011,248,8,165,843,0,7,2009,...,221,25,64,854,312,3,0,350,1365,1
2,13,77,2012,234,7,168,836,0,5,1952,...,211,24,67,875,328,5,0,355,1380,1
3,13,77,2013,243,10,176,862,0,4,1916,...,233,12,67,875,299,3,0,357,1385,0
4,13,77,2014,206,10,151,820,0,4,1880,...,247,15,68,892,297,3,0,392,1341,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3173,21,771,2019,18,0,11,155,0,2,500,...,26,2,8,110,40,0,0,25,152,0
3174,21,771,2020,17,0,11,153,0,3,471,...,17,3,6,92,39,0,0,44,138,0
3175,21,771,2021,16,0,10,152,0,1,473,...,22,3,7,95,39,0,0,29,139,0
3176,21,771,2022,27,0,2,143,0,1,463,...,38,2,1,108,52,0,0,22,156,0


In [28]:
workplace_population_df["Region"] = workplace_population_df["Region"].str.zfill(2)
workplace_population_df["Municipality"] = workplace_population_df["Municipality"].str.zfill(3)
workplace_population_df

Unnamed: 0,Region,Municipality,Year,"A Agriculture, forestry and fishing (TP)",B Mining and quarrying (TP),C Manufacturing (TP),Children aged 0 to 14 (PT),"D Electricity, gas, steam and air conditioning supply (TP)","E Water supply; sewerage, waste management and remediation activities (TP)",Employed (PT),...,Q Human health and social work activities (TP),"R Arts, entertainment and recreation (TP)",S Other service activities (TP),Services (TP),Students (PT),T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use (TP),U Activities of extraterritorial organisations and bodies (TP),Unemployed (PT),"Workplaces, total (TP)",X Industry unknown (TP)
0,13,077,2010,232,6,154,854,0,6,1994,...,277,20,65,917,340,3,0,330,1402,0
1,13,077,2011,248,8,165,843,0,7,2009,...,221,25,64,854,312,3,0,350,1365,1
2,13,077,2012,234,7,168,836,0,5,1952,...,211,24,67,875,328,5,0,355,1380,1
3,13,077,2013,243,10,176,862,0,4,1916,...,233,12,67,875,299,3,0,357,1385,0
4,13,077,2014,206,10,151,820,0,4,1880,...,247,15,68,892,297,3,0,392,1341,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3173,21,771,2019,18,0,11,155,0,2,500,...,26,2,8,110,40,0,0,25,152,0
3174,21,771,2020,17,0,11,153,0,3,471,...,17,3,6,92,39,0,0,44,138,0
3175,21,771,2021,16,0,10,152,0,1,473,...,22,3,7,95,39,0,0,29,139,0
3176,21,771,2022,27,0,2,143,0,1,463,...,38,2,1,108,52,0,0,22,156,0


In [29]:
workplace_population_df = workplace_population_df.sort_values(["Region", "Municipality"])
workplace_population_df

Unnamed: 0,Region,Municipality,Year,"A Agriculture, forestry and fishing (TP)",B Mining and quarrying (TP),C Manufacturing (TP),Children aged 0 to 14 (PT),"D Electricity, gas, steam and air conditioning supply (TP)","E Water supply; sewerage, waste management and remediation activities (TP)",Employed (PT),...,Q Human health and social work activities (TP),"R Arts, entertainment and recreation (TP)",S Other service activities (TP),Services (TP),Students (PT),T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use (TP),U Activities of extraterritorial organisations and bodies (TP),Unemployed (PT),"Workplaces, total (TP)",X Industry unknown (TP)
2772,01,018,2010,126,1,216,996,0,1,2297,...,98,6,24,532,314,0,0,173,1172,0
2773,01,018,2011,123,0,231,1005,0,2,2320,...,108,7,23,551,315,0,0,165,1191,0
2774,01,018,2012,125,0,233,1035,0,2,2364,...,203,7,31,645,302,0,0,175,1283,0
2775,01,018,2013,111,1,232,1045,0,1,2304,...,206,6,31,640,304,0,0,213,1281,0
2776,01,018,2014,113,1,227,1050,0,1,2306,...,194,4,34,621,309,0,0,244,1256,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3173,21,771,2019,18,0,11,155,0,2,500,...,26,2,8,110,40,0,0,25,152,0
3174,21,771,2020,17,0,11,153,0,3,471,...,17,3,6,92,39,0,0,44,138,0
3175,21,771,2021,16,0,10,152,0,1,473,...,22,3,7,95,39,0,0,29,139,0
3176,21,771,2022,27,0,2,143,0,1,463,...,38,2,1,108,52,0,0,22,156,0


In [30]:
final_df = pd.merge(
    left=final_df,
    right=workplace_population_df,
    how="outer",
    on=["Region", "Year", "Municipality"]
)
final_df

Unnamed: 0,Region,Municipality,Year,GDP per capita (euro at current prices),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)",...,Q Human health and social work activities (TP),"R Arts, entertainment and recreation (TP)",S Other service activities (TP),Services (TP),Students (PT),T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use (TP),U Activities of extraterritorial organisations and bodies (TP),Unemployed (PT),"Workplaces, total (TP)",X Industry unknown (TP)
0,01,TOTAL,2000,36270.1,,,255.5,40.8,2501.6,5985.4,...,,,,,,,,,,
1,01,TOTAL,2001,38454.8,,,235.4,43.4,2766.6,6491.5,...,,,,,,,,,,
2,01,TOTAL,2002,38552.7,,,246.1,39.1,2449.9,6530.9,...,,,,,,,,,,
3,01,TOTAL,2003,38643.2,,,235.6,34.6,2567,6380.4,...,,,,,,,,,,
4,01,TOTAL,2004,40312.2,,,233.9,42.8,2754.4,6842.6,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3692,WHOLE COUNTRY,TOTAL,2019,,,,5223.0,819,14966,17939,...,,,,,,,,,,
3693,WHOLE COUNTRY,TOTAL,2020,,,,5296.0,964,15368,18421,...,,,,,,,,,,
3694,WHOLE COUNTRY,TOTAL,2021,,,,5371.0,1049,14924,19070,...,,,,,,,,,,
3695,WHOLE COUNTRY,TOTAL,2022,,,,5961.0,1100,15411,19557,...,,,,,,,,,,


### Vacancies and unemployment (Ignacio)

In [31]:
vacancies_unemployment_path = notebook_path / "data" / "vacancies_unemployment.csv"
vacancies_unemployment_df = pd.read_csv(vacancies_unemployment_path, dtype={"Region": str, "Municipality": str, "Year": int})
vacancies_unemployment_df

Unnamed: 0,Region,Municipality,Year,#Unemployed jobseekers,#Vacancies
0,01,018,2008,110,9
1,01,018,2009,166,6
2,01,018,2010,164,6
3,01,018,2011,162,6
4,01,018,2012,157,10
...,...,...,...,...,...
5539,21,941,2021,15,1
5540,21,941,2022,9,1
5541,21,941,2023,8,2
5542,21,941,2024,10,4


In [32]:
vacancies_unemployment_df.rename(columns={
    "#Unemployed jobseekers": "Unemployed jobseekers",
    "#Vacancies": "Vacancies"
    }, 
    inplace=True)

vacancies_unemployment_df.columns

Index(['Region', 'Municipality', 'Year', 'Unemployed jobseekers', 'Vacancies'], dtype='object')

In [33]:
final_df = pd.merge(
    left=final_df,
    right=vacancies_unemployment_df,
    how="outer",
    on=["Region", "Year", "Municipality"]
)
final_df

Unnamed: 0,Region,Municipality,Year,GDP per capita (euro at current prices),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)",...,S Other service activities (TP),Services (TP),Students (PT),T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use (TP),U Activities of extraterritorial organisations and bodies (TP),Unemployed (PT),"Workplaces, total (TP)",X Industry unknown (TP),Unemployed jobseekers,Vacancies
0,01,TOTAL,2000,36270.1,,,255.5,40.8,2501.6,5985.4,...,,,,,,,,,,
1,01,TOTAL,2001,38454.8,,,235.4,43.4,2766.6,6491.5,...,,,,,,,,,,
2,01,TOTAL,2002,38552.7,,,246.1,39.1,2449.9,6530.9,...,,,,,,,,,,
3,01,TOTAL,2003,38643.2,,,235.6,34.6,2567,6380.4,...,,,,,,,,,,
4,01,TOTAL,2004,40312.2,,,233.9,42.8,2754.4,6842.6,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6058,WHOLE COUNTRY,TOTAL,2019,,,,5223.0,819,14966,17939,...,,,,,,,,,,
6059,WHOLE COUNTRY,TOTAL,2020,,,,5296.0,964,15368,18421,...,,,,,,,,,,
6060,WHOLE COUNTRY,TOTAL,2021,,,,5371.0,1049,14924,19070,...,,,,,,,,,,
6061,WHOLE COUNTRY,TOTAL,2022,,,,5961.0,1100,15411,19557,...,,,,,,,,,,


In [34]:
final_df = final_df.sort_values(["Region", "Municipality", "Year"])
final_df

Unnamed: 0,Region,Municipality,Year,GDP per capita (euro at current prices),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)",...,S Other service activities (TP),Services (TP),Students (PT),T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use (TP),U Activities of extraterritorial organisations and bodies (TP),Unemployed (PT),"Workplaces, total (TP)",X Industry unknown (TP),Unemployed jobseekers,Vacancies
8,01,018,2008,,,,,,,,...,,,,,,,,,110.0,9.0
35,01,018,2009,,,,,,,,...,,,,,,,,,166.0,6.0
62,01,018,2010,,,,,,,,...,24.0,532.0,314.0,0.0,0.0,173.0,1172.0,0.0,164.0,6.0
89,01,018,2011,,,,,,,,...,23.0,551.0,315.0,0.0,0.0,165.0,1191.0,0.0,162.0,6.0
116,01,018,2012,,,,,,,,...,31.0,645.0,302.0,0.0,0.0,175.0,1283.0,0.0,157.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6058,WHOLE COUNTRY,TOTAL,2019,,,,5223.0,819,14966,17939,...,,,,,,,,,,
6059,WHOLE COUNTRY,TOTAL,2020,,,,5296.0,964,15368,18421,...,,,,,,,,,,
6060,WHOLE COUNTRY,TOTAL,2021,,,,5371.0,1049,14924,19070,...,,,,,,,,,,
6061,WHOLE COUNTRY,TOTAL,2022,,,,5961.0,1100,15411,19557,...,,,,,,,,,,


### Enterprises, bankruptcies (Ignacio)

In [35]:
# data from years 2005 - 2013

enterprise_stock_of_industry_df = notebook_path / "data/openings_closings_2005-2013.xlsx"
enterprise_stock_of_industry_df = pd.read_excel(enterprise_stock_of_industry_df, dtype={"Municipality": str})
enterprise_stock_of_industry_df

Unnamed: 0,Municipality,Year,"A Agriculture, forestry and fishing Openings (unit)",B Mining and quarrying Openings (unit),C Manufacturing Openings (unit),"D Electricity, gas, steam and air conditioning supply Openings (unit)","E Water supply: sewerage, waste management and remediation activities Openings (unit)",F Construction Openings (unit),G Wholesale and retail trade: repair of motor vehicles and motorcycles Openings (unit),H Transportation and storage Openings (unit),...,"M Professional, scientific and technical activities Stock of Enterprises (unit)",N Administrative and support service activities Stock Of Enterprises (unit),O Public administration and defence: compulsory social security Stock Of Enterprises (unit),P Education Stock Of Enterprises (unit),Q Human health and social work activities Stock Of Enterprises (unit),"R Arts, entertainment and recreation Stock Of Enterprises (unit)",S Other service activities Stock Of Enterprises (unit),T Activities of households as employers: undifferentiated goods and servicesproducing activities of households for own use Stock Of Enterprises (unit),U Activities of extraterritorial organisations and bodies Stock Of Enterprises (unit),X Industry unknown Stock Of Enterprises (unit)
0,005 Alajärvi,2005.0,5.0,1.0,7.0,1.0,,17.0,13.0,1.0,...,44.0,29.0,,6.0,14.0,8.0,44.0,,,
1,,2006.0,2.0,,7.0,,2.0,13.0,11.0,,...,46.0,28.0,,7.0,15.0,10.0,46.0,,,
2,,2007.0,4.0,,12.0,,1.0,17.0,10.0,3.0,...,52.0,28.0,,7.0,18.0,8.0,46.0,,,
3,,2008.0,10.0,1.0,7.0,,,7.0,6.0,5.0,...,56.0,27.0,,7.0,20.0,7.0,44.0,,,
4,,2009.0,9.0,1.0,3.0,,,6.0,8.0,4.0,...,49.0,28.0,,6.0,20.0,6.0,41.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2924,,,,,,,,,,,...,,,,,,,,,,
2925,,,,,,,,,,,...,,,,,,,,,,
2926,,,,,,,,,,,...,,,,,,,,,,
2927,Sisäinen viitekoodi:,,,,,,,,,,...,,,,,,,,,,


In [36]:
enterprise_stock_of_industry_df["Municipality"] = enterprise_stock_of_industry_df["Municipality"].ffill()
enterprise_stock_of_industry_df

Unnamed: 0,Municipality,Year,"A Agriculture, forestry and fishing Openings (unit)",B Mining and quarrying Openings (unit),C Manufacturing Openings (unit),"D Electricity, gas, steam and air conditioning supply Openings (unit)","E Water supply: sewerage, waste management and remediation activities Openings (unit)",F Construction Openings (unit),G Wholesale and retail trade: repair of motor vehicles and motorcycles Openings (unit),H Transportation and storage Openings (unit),...,"M Professional, scientific and technical activities Stock of Enterprises (unit)",N Administrative and support service activities Stock Of Enterprises (unit),O Public administration and defence: compulsory social security Stock Of Enterprises (unit),P Education Stock Of Enterprises (unit),Q Human health and social work activities Stock Of Enterprises (unit),"R Arts, entertainment and recreation Stock Of Enterprises (unit)",S Other service activities Stock Of Enterprises (unit),T Activities of households as employers: undifferentiated goods and servicesproducing activities of households for own use Stock Of Enterprises (unit),U Activities of extraterritorial organisations and bodies Stock Of Enterprises (unit),X Industry unknown Stock Of Enterprises (unit)
0,005 Alajärvi,2005.0,5.0,1.0,7.0,1.0,,17.0,13.0,1.0,...,44.0,29.0,,6.0,14.0,8.0,44.0,,,
1,005 Alajärvi,2006.0,2.0,,7.0,,2.0,13.0,11.0,,...,46.0,28.0,,7.0,15.0,10.0,46.0,,,
2,005 Alajärvi,2007.0,4.0,,12.0,,1.0,17.0,10.0,3.0,...,52.0,28.0,,7.0,18.0,8.0,46.0,,,
3,005 Alajärvi,2008.0,10.0,1.0,7.0,,,7.0,6.0,5.0,...,56.0,27.0,,7.0,20.0,7.0,44.0,,,
4,005 Alajärvi,2009.0,9.0,1.0,3.0,,,6.0,8.0,4.0,...,49.0,28.0,,6.0,20.0,6.0,41.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2924,Lukumäärä,,,,,,,,,,...,,,,,,,,,,
2925,Lukumäärä,,,,,,,,,,...,,,,,,,,,,
2926,Lukumäärä,,,,,,,,,,...,,,,,,,,,,
2927,Sisäinen viitekoodi:,,,,,,,,,,...,,,,,,,,,,


In [37]:
split_code_from_name(enterprise_stock_of_industry_df, "Municipality", code_length=3)
enterprise_stock_of_industry_df

Unnamed: 0,Municipality,Year,"A Agriculture, forestry and fishing Openings (unit)",B Mining and quarrying Openings (unit),C Manufacturing Openings (unit),"D Electricity, gas, steam and air conditioning supply Openings (unit)","E Water supply: sewerage, waste management and remediation activities Openings (unit)",F Construction Openings (unit),G Wholesale and retail trade: repair of motor vehicles and motorcycles Openings (unit),H Transportation and storage Openings (unit),...,"M Professional, scientific and technical activities Stock of Enterprises (unit)",N Administrative and support service activities Stock Of Enterprises (unit),O Public administration and defence: compulsory social security Stock Of Enterprises (unit),P Education Stock Of Enterprises (unit),Q Human health and social work activities Stock Of Enterprises (unit),"R Arts, entertainment and recreation Stock Of Enterprises (unit)",S Other service activities Stock Of Enterprises (unit),T Activities of households as employers: undifferentiated goods and servicesproducing activities of households for own use Stock Of Enterprises (unit),U Activities of extraterritorial organisations and bodies Stock Of Enterprises (unit),X Industry unknown Stock Of Enterprises (unit)
0,005,2005.0,5.0,1.0,7.0,1.0,,17.0,13.0,1.0,...,44.0,29.0,,6.0,14.0,8.0,44.0,,,
1,005,2006.0,2.0,,7.0,,2.0,13.0,11.0,,...,46.0,28.0,,7.0,15.0,10.0,46.0,,,
2,005,2007.0,4.0,,12.0,,1.0,17.0,10.0,3.0,...,52.0,28.0,,7.0,18.0,8.0,46.0,,,
3,005,2008.0,10.0,1.0,7.0,,,7.0,6.0,5.0,...,56.0,27.0,,7.0,20.0,7.0,44.0,,,
4,005,2009.0,9.0,1.0,3.0,,,6.0,8.0,4.0,...,49.0,28.0,,6.0,20.0,6.0,41.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2924,ärä,,,,,,,,,,...,,,,,,,,,,
2925,ärä,,,,,,,,,,...,,,,,,,,,,
2926,ärä,,,,,,,,,,...,,,,,,,,,,
2927,nen,,,,,,,,,,...,,,,,,,,,,


In [38]:
# match Municipality code to corresponding Region code

region_code_mapping_path = notebook_path / "data/municipality-region-correspondence.csv"
region_code_mapping_df = pd.read_csv(region_code_mapping_path, sep=";", dtype={"sourceCode": str, "targetCode": str})
region_code_mapping_df = region_code_mapping_df[["sourceCode", "targetCode"]]
region_code_mapping_df

Unnamed: 0,sourceCode,targetCode
0,020,06
1,005,14
2,009,17
3,010,14
4,016,07
...,...,...
304,977,17
305,980,06
306,981,05
307,989,14


In [39]:
# create a new Region column in enterprise_stock_of_industry_df by mapping Municipality to Region using region_code_mapping_df
enterprise_stock_of_industry_df = pd.merge(
    left=enterprise_stock_of_industry_df,
    right=region_code_mapping_df,
    how="left",
    left_on="Municipality",
    right_on="sourceCode"
)
enterprise_stock_of_industry_df

Unnamed: 0,Municipality,Year,"A Agriculture, forestry and fishing Openings (unit)",B Mining and quarrying Openings (unit),C Manufacturing Openings (unit),"D Electricity, gas, steam and air conditioning supply Openings (unit)","E Water supply: sewerage, waste management and remediation activities Openings (unit)",F Construction Openings (unit),G Wholesale and retail trade: repair of motor vehicles and motorcycles Openings (unit),H Transportation and storage Openings (unit),...,O Public administration and defence: compulsory social security Stock Of Enterprises (unit),P Education Stock Of Enterprises (unit),Q Human health and social work activities Stock Of Enterprises (unit),"R Arts, entertainment and recreation Stock Of Enterprises (unit)",S Other service activities Stock Of Enterprises (unit),T Activities of households as employers: undifferentiated goods and servicesproducing activities of households for own use Stock Of Enterprises (unit),U Activities of extraterritorial organisations and bodies Stock Of Enterprises (unit),X Industry unknown Stock Of Enterprises (unit),sourceCode,targetCode
0,005,2005.0,5.0,1.0,7.0,1.0,,17.0,13.0,1.0,...,,6.0,14.0,8.0,44.0,,,,005,14
1,005,2006.0,2.0,,7.0,,2.0,13.0,11.0,,...,,7.0,15.0,10.0,46.0,,,,005,14
2,005,2007.0,4.0,,12.0,,1.0,17.0,10.0,3.0,...,,7.0,18.0,8.0,46.0,,,,005,14
3,005,2008.0,10.0,1.0,7.0,,,7.0,6.0,5.0,...,,7.0,20.0,7.0,44.0,,,,005,14
4,005,2009.0,9.0,1.0,3.0,,,6.0,8.0,4.0,...,,6.0,20.0,6.0,41.0,,,,005,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2924,ärä,,,,,,,,,,...,,,,,,,,,,
2925,ärä,,,,,,,,,,...,,,,,,,,,,
2926,ärä,,,,,,,,,,...,,,,,,,,,,
2927,nen,,,,,,,,,,...,,,,,,,,,,


In [40]:
enterprise_stock_of_industry_df.drop(columns=["sourceCode"], inplace=True)
enterprise_stock_of_industry_df.rename(columns={"targetCode": "Region"}, inplace=True)
enterprise_stock_of_industry_df

Unnamed: 0,Municipality,Year,"A Agriculture, forestry and fishing Openings (unit)",B Mining and quarrying Openings (unit),C Manufacturing Openings (unit),"D Electricity, gas, steam and air conditioning supply Openings (unit)","E Water supply: sewerage, waste management and remediation activities Openings (unit)",F Construction Openings (unit),G Wholesale and retail trade: repair of motor vehicles and motorcycles Openings (unit),H Transportation and storage Openings (unit),...,N Administrative and support service activities Stock Of Enterprises (unit),O Public administration and defence: compulsory social security Stock Of Enterprises (unit),P Education Stock Of Enterprises (unit),Q Human health and social work activities Stock Of Enterprises (unit),"R Arts, entertainment and recreation Stock Of Enterprises (unit)",S Other service activities Stock Of Enterprises (unit),T Activities of households as employers: undifferentiated goods and servicesproducing activities of households for own use Stock Of Enterprises (unit),U Activities of extraterritorial organisations and bodies Stock Of Enterprises (unit),X Industry unknown Stock Of Enterprises (unit),Region
0,005,2005.0,5.0,1.0,7.0,1.0,,17.0,13.0,1.0,...,29.0,,6.0,14.0,8.0,44.0,,,,14
1,005,2006.0,2.0,,7.0,,2.0,13.0,11.0,,...,28.0,,7.0,15.0,10.0,46.0,,,,14
2,005,2007.0,4.0,,12.0,,1.0,17.0,10.0,3.0,...,28.0,,7.0,18.0,8.0,46.0,,,,14
3,005,2008.0,10.0,1.0,7.0,,,7.0,6.0,5.0,...,27.0,,7.0,20.0,7.0,44.0,,,,14
4,005,2009.0,9.0,1.0,3.0,,,6.0,8.0,4.0,...,28.0,,6.0,20.0,6.0,41.0,,,,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2924,ärä,,,,,,,,,,...,,,,,,,,,,
2925,ärä,,,,,,,,,,...,,,,,,,,,,
2926,ärä,,,,,,,,,,...,,,,,,,,,,
2927,nen,,,,,,,,,,...,,,,,,,,,,


In [41]:
# save to own csv so that Ignacio can independently work with it
enterprise_stock_of_industry_df.to_csv(notebook_path / "data/stock_of_enterprises_2003_2012.csv", index=False)

In [42]:
stock_of_enterprise_df = pd.read_csv(notebook_path / "data/stock_of_enterprises_2003_2012.csv", dtype={"Region": str, "Municipality": str, "Year": int})
stock_of_enterprise_df

  chunks = self._reader.read_low_memory(nrows)


ValueError: cannot safely convert passed user dtype of int64 for float64 dtyped data in column 1

In [None]:
for col in stock_of_enterprise_df.columns[stock_of_enterprise_df.columns.str.contains("Net openings")]:
    col_industry = col.replace(" Net openings (unit)", "")
    openings = stock_of_enterprise_df[col_industry + " Openings (unit)"]
    closings = stock_of_enterprise_df[col_industry + " Closings (unit)"]
    stock_of_enterprise_df[col] = stock_of_enterprise_df[openings] - stock_of_enterprise_df[closings]

A Agriculture, forestry and fishing Net openings (unit)
B Mining and quarrying Net openings (unit)
C Manufacturing Net openings (unit)
D Electricity,  gas, steam and air conditioning supply Net openings (unit)
E Water supply: sewerage, waste management and remediation activities Net openings (unit)
F Construction Net openings (unit)
G Wholesale and retail trade: repair of motor vehicles and motorcycles Net openings (unit)
H Transportation and storage Net openings (unit)
I Accommodation and food service activities Net openings (unit)
J Information and communication Net openings (unit)
K Financial and insurance activities Net openings (unit)
L Real estate activities Net openings (unit)
M Professional, scientific and technical activities Net openings (unit)
N Administrative and support service activities Net openings (unit)
O Public administration and defence: compulsory social security Net openings (unit)
P Education Net openings (unit)
Q Human health and social work activities Net openi

In [None]:
enterprise_bankruptcy_path = notebook_path / "data" / "enterprises_bankrupcies_investments.csv"
enterprise_bankruptcy_df = pd.read_csv(enterprise_bankruptcy_path, dtype={"Region": str, "Municipality": str, "Year": int})
enterprise_bankruptcy_df

Unnamed: 0,Region,Municipality,Year,"A Agriculture, forestry and fishing Stock Of Enterprises (unit)",B Mining and quarrying Stock Of Enterprises (unit),C Manufacturing Stock Of Enterprises (unit),"D Electricity, gas, steam and air conditioning supply Stock Of Enterprises (unit)","E Water supply: sewerage, waste management and remediation activities Stock Of Enterprises (unit)",F Construction Stock Of Enterprises (unit),G Wholesale and retail trade: repair of motor vehicles and motorcycles Stock Of Enterprises (unit),...,N Administrative and support service activities Investments (mln of euro),O Public administration and defence; compulsory social security Investments (mln of euro),P Education Investments (mln of euro),Q Human health and social work activities Investments (mln of euro),T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use Investments (mln of euro),C Manufacturing Investments (mln of euro),D Electricity Investments (mln of euro),E Water Supply Investments (mln of euro),R Human health and social work activities Investments (mln of euro),"S Arts, sports and recreation Investments (mln of euro)"
0,01,018,2003,,,,,,,,...,,,,,,,,,,
1,01,018,2004,,,,,,,,...,,,,,,,,,,
2,01,018,2005,,,,,,,,...,,,,,,,,,,
3,01,018,2006,,,,,,,,...,,,,,,,,,,
4,01,018,2007,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7279,911,TOTAL,2014,0.0,0.0,0.0,0.0,1.0,2.0,2.0,...,,,,,,,,,,
7280,911,TOTAL,2015,-1.0,0.0,0.0,0.0,0.0,-1.0,3.0,...,,,,,,,,,,
7281,911,TOTAL,2016,-2.0,0.0,1.0,0.0,0.0,-2.0,-2.0,...,,,,,,,,,,
7282,911,TOTAL,2017,2.0,0.0,1.0,0.0,0.0,0.0,2.0,...,,,,,,,,,,


In [None]:
# merge enterprise_stock_of_industry_df with enterprise_bankruptcy_df (all columns containing "Stock of Enterprises")

enterprise_bankruptcy_df_0 = pd.merge(
    left=enterprise_bankruptcy_df,
    right=enterprise_stock_of_industry_df,
    how="outer",
    on=["Region", "Year", "Municipality"]
)
enterprise_bankruptcy_df_0

KeyError: 'Region'

In [None]:
final_df = pd.merge(
    left=final_df,
    right=enterprise_bankruptcy_wide_df,
    how="outer",
    on=["Region", "Year", "Municipality"]
)
final_df

Unnamed: 0,Region,Municipality,Year,GDP per capita (euro at current prices),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)",...,Vacancies,Bankruptcies in agriculture,Bankruptcies in industry,Bankruptcies in services,Enterprises opening-closing in agriculture,Enterprises opening-closing in industry,Enterprises opening-closing in services,Investments in agriculture (mln eur),Investments in industry (mln eur),Investments in services (mln eur)
0,01,TOTAL,2000,36270.1,,,255.5,40.8,2501.6,5985.4,...,,0.0,0.0,0.0,0.0,0.0,0.0,98.0,2491.4,8837.6
1,01,TOTAL,2001,38454.8,,,235.4,43.4,2766.6,6491.5,...,,0.0,0.0,0.0,0.0,0.0,0.0,103.4,2471.4,9357.3
2,01,TOTAL,2002,38552.7,,,246.1,39.1,2449.9,6530.9,...,,0.0,0.0,0.0,0.0,0.0,0.0,93.1,2279.7,9569.3
3,01,018,2003,,,,,,,,...,,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,01,049,2003,,,,,,,,...,,0.0,6.0,59.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7634,WHOLE COUNTRY,TOTAL,2019,,,,5223.0,819,14966,17939,...,,,,,,,,,,
7635,WHOLE COUNTRY,TOTAL,2020,,,,5296.0,964,15368,18421,...,,,,,,,,,,
7636,WHOLE COUNTRY,TOTAL,2021,,,,5371.0,1049,14924,19070,...,,,,,,,,,,
7637,WHOLE COUNTRY,TOTAL,2022,,,,5961.0,1100,15411,19557,...,,,,,,,,,,


### Household income, expenditure, and debt (Khanh)

In [42]:
income_expenditure_debt_path = notebook_path / "data" / "Household income, expenditure and debt.csv"
income_expenditure_debt_df = pd.read_csv(income_expenditure_debt_path, dtype={"Region": str, "Municipality": str, "Year": int})
income_expenditure_debt_df

Unnamed: 0,region,municipality,year,Median of debt for All debts,Median of debt for Consumption debts and other debts,Median of debt for Debts charged on business activities and a source of income,Median of debt for Housing loan debts,Median of debt for Study loan debts,Mean of debt for All debts,Mean of debt for Consumption debts and other debts,...,"D12R Employer's social security contributions, income","D4OMR Property income, (incl. holding gains and losses), income","D4K Property income, expenditure",B5N Net national income,"D62R Social benefits other than social transfers in kind, income","D7R Other current transfers, income","D5K Income, property and other direct taxes excl. capital taxes, expenditure",D61K Employer's social security contributions,"D7K Other current transfers, expenditure","B6N Disposable income, net"
0,1,018,2003,59388.0,11716.0,40324.0,61190.0,4344.0,80012.0000,20022.0000,...,,,,,,,,,,
1,1,018,2004,58848.0,12650.0,39612.0,65961.0,4353.0,83471.0000,21301.0000,...,,,,,,,,,,
2,1,018,2005,69631.0,14614.0,43714.0,79899.0,4612.0,105231.0000,31828.0000,...,,,,,,,,,,
3,1,018,2006,81678.0,16863.0,39098.0,88128.0,4851.0,110952.0000,31722.0000,...,,,,,,,,,,
4,1,018,2007,84166.0,16356.0,41275.0,97776.0,3894.0,118083.0000,31443.0000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7208,21,Total,2018,,,,,53626.0,135757.5000,54686.4375,...,110.0,100.0,5.0,844.0,218.0,13.0,155.0,169.0,34.0,717.0
7209,21,Total,2019,,,,,55975.0,136165.5000,54918.5000,...,113.0,104.0,5.0,884.0,226.0,14.0,171.0,176.0,36.0,741.0
7210,21,Total,2020,,,,,65660.0,139930.0625,56271.7500,...,103.0,106.0,4.0,863.0,240.0,13.0,170.0,166.0,37.0,744.0
7211,21,Total,2021,,,,,64339.0,136181.1875,55449.0625,...,114.0,181.0,6.0,968.0,242.0,14.0,189.0,180.0,42.0,812.0


In [43]:
income_expenditure_debt_df.columns

Index(['region', 'municipality', 'year', 'Median of debt for All debts',
       'Median of debt for Consumption debts and other debts',
       'Median of debt for Debts charged on business activities and a source of income',
       'Median of debt for Housing loan debts',
       'Median of debt for Study loan debts', 'Mean of debt for All debts',
       'Mean of debt for Consumption debts and other debts',
       'Mean of debt for Debts charged on business activities and a source of income',
       'Mean of debt for Housing loan debts',
       'Mean of debt for Study loan debts', 'B2N Operating surplus, net',
       'B3N Mixed income, net', 'D11R Wages and salaries received, income',
       'D12R Employer's social security contributions, income',
       'D4OMR  Property income, (incl. holding gains and losses), income',
       'D4K Property income, expenditure', 'B5N Net national income',
       'D62R Social benefits other than social transfers in kind, income',
       'D7R Other curre

In [44]:
# rename columns
income_expenditure_debt_df = income_expenditure_debt_df.rename(columns={
    'region': "Region", 
    'municipality': "Municipality", 
    'year': "Year", 
    'Median of debt for All debts': 'Median of debt for all debts',
    'Median of debt for Consumption debts and other debts': "Median of debt for consumption debts and other debts",
    'Median of debt for Debts charged on business activities and a source of income': "Median of debt for debts charged on business activities and a source of income",
    'Median of debt for Housing loan debts': 'Median of debt for housing loan debts',
    'Median of debt for Study loan debts': 'Median of debt for study loan debts', 
    'Mean of debt for All debts': 'Mean of debt for all debts',
    'Mean of debt for Consumption debts and other debts': "Mean of debt for consumption debts and other debts",
    'Mean of debt for Debts charged on business activities and a source of income': "Mean of debt for debts charged on business activities and a source of income",
    'Mean of debt for Housing loan debts': "Mean of debt for housing loan debts",
    'Mean of debt for Study loan debts': "Mean of debt for study loan debts", 
    'B2N Operating surplus, net': "Operating surplus, net",
    'B3N Mixed income, net': "Mixed income, net", 
    'D11R Wages and salaries received, income': "Wages and salaries received, income",
    "D12R Employer's social security contributions, income": "Employer's social security contributions, income",
    'D4OMR  Property income, (incl. holding gains and losses), income': "Property income, (incl. holding gains and losses), income",
    'D4K Property income, expenditure': "Property income, expenditure", 
    'B5N Net national income': "Net national income",
    'D62R Social benefits other than social transfers in kind, income': "Social benefits other than social transfers in kind, income",
    'D7R Other current transfers, income': "Other current transfers, income",
    'D5K Income, property and other direct taxes excl. capital taxes, expenditure': "Income, property and other direct taxes excl. capital taxes, expenditure",
    "D61K Employer's social security contributions": "Employer's social security contributions",
    'D7K Other current transfers, expenditure': "Other current transfers, expenditure",
    'B6N Disposable income, net': "Disposable income, net"
})

income_expenditure_debt_df

Unnamed: 0,Region,Municipality,Year,Median of debt for all debts,Median of debt for consumption debts and other debts,Median of debt for debts charged on business activities and a source of income,Median of debt for housing loan debts,Median of debt for study loan debts,Mean of debt for all debts,Mean of debt for consumption debts and other debts,...,"Employer's social security contributions, income","Property income, (incl. holding gains and losses), income","Property income, expenditure",Net national income,"Social benefits other than social transfers in kind, income","Other current transfers, income","Income, property and other direct taxes excl. capital taxes, expenditure",Employer's social security contributions,"Other current transfers, expenditure","Disposable income, net"
0,1,018,2003,59388.0,11716.0,40324.0,61190.0,4344.0,80012.0000,20022.0000,...,,,,,,,,,,
1,1,018,2004,58848.0,12650.0,39612.0,65961.0,4353.0,83471.0000,21301.0000,...,,,,,,,,,,
2,1,018,2005,69631.0,14614.0,43714.0,79899.0,4612.0,105231.0000,31828.0000,...,,,,,,,,,,
3,1,018,2006,81678.0,16863.0,39098.0,88128.0,4851.0,110952.0000,31722.0000,...,,,,,,,,,,
4,1,018,2007,84166.0,16356.0,41275.0,97776.0,3894.0,118083.0000,31443.0000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7208,21,Total,2018,,,,,53626.0,135757.5000,54686.4375,...,110.0,100.0,5.0,844.0,218.0,13.0,155.0,169.0,34.0,717.0
7209,21,Total,2019,,,,,55975.0,136165.5000,54918.5000,...,113.0,104.0,5.0,884.0,226.0,14.0,171.0,176.0,36.0,741.0
7210,21,Total,2020,,,,,65660.0,139930.0625,56271.7500,...,103.0,106.0,4.0,863.0,240.0,13.0,170.0,166.0,37.0,744.0
7211,21,Total,2021,,,,,64339.0,136181.1875,55449.0625,...,114.0,181.0,6.0,968.0,242.0,14.0,189.0,180.0,42.0,812.0


In [45]:
# pad Region column with zeros to length 2
income_expenditure_debt_df["Region"] = income_expenditure_debt_df["Region"].astype(str).str.zfill(2)
income_expenditure_debt_df

Unnamed: 0,Region,Municipality,Year,Median of debt for all debts,Median of debt for consumption debts and other debts,Median of debt for debts charged on business activities and a source of income,Median of debt for housing loan debts,Median of debt for study loan debts,Mean of debt for all debts,Mean of debt for consumption debts and other debts,...,"Employer's social security contributions, income","Property income, (incl. holding gains and losses), income","Property income, expenditure",Net national income,"Social benefits other than social transfers in kind, income","Other current transfers, income","Income, property and other direct taxes excl. capital taxes, expenditure",Employer's social security contributions,"Other current transfers, expenditure","Disposable income, net"
0,01,018,2003,59388.0,11716.0,40324.0,61190.0,4344.0,80012.0000,20022.0000,...,,,,,,,,,,
1,01,018,2004,58848.0,12650.0,39612.0,65961.0,4353.0,83471.0000,21301.0000,...,,,,,,,,,,
2,01,018,2005,69631.0,14614.0,43714.0,79899.0,4612.0,105231.0000,31828.0000,...,,,,,,,,,,
3,01,018,2006,81678.0,16863.0,39098.0,88128.0,4851.0,110952.0000,31722.0000,...,,,,,,,,,,
4,01,018,2007,84166.0,16356.0,41275.0,97776.0,3894.0,118083.0000,31443.0000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7208,21,Total,2018,,,,,53626.0,135757.5000,54686.4375,...,110.0,100.0,5.0,844.0,218.0,13.0,155.0,169.0,34.0,717.0
7209,21,Total,2019,,,,,55975.0,136165.5000,54918.5000,...,113.0,104.0,5.0,884.0,226.0,14.0,171.0,176.0,36.0,741.0
7210,21,Total,2020,,,,,65660.0,139930.0625,56271.7500,...,103.0,106.0,4.0,863.0,240.0,13.0,170.0,166.0,37.0,744.0
7211,21,Total,2021,,,,,64339.0,136181.1875,55449.0625,...,114.0,181.0,6.0,968.0,242.0,14.0,189.0,180.0,42.0,812.0


In [46]:
# all instances of "total" in Municipality replaced by "TOTAL"
income_expenditure_debt_df["Municipality"] = income_expenditure_debt_df["Municipality"].replace("Total", "TOTAL")
income_expenditure_debt_df

Unnamed: 0,Region,Municipality,Year,Median of debt for all debts,Median of debt for consumption debts and other debts,Median of debt for debts charged on business activities and a source of income,Median of debt for housing loan debts,Median of debt for study loan debts,Mean of debt for all debts,Mean of debt for consumption debts and other debts,...,"Employer's social security contributions, income","Property income, (incl. holding gains and losses), income","Property income, expenditure",Net national income,"Social benefits other than social transfers in kind, income","Other current transfers, income","Income, property and other direct taxes excl. capital taxes, expenditure",Employer's social security contributions,"Other current transfers, expenditure","Disposable income, net"
0,01,018,2003,59388.0,11716.0,40324.0,61190.0,4344.0,80012.0000,20022.0000,...,,,,,,,,,,
1,01,018,2004,58848.0,12650.0,39612.0,65961.0,4353.0,83471.0000,21301.0000,...,,,,,,,,,,
2,01,018,2005,69631.0,14614.0,43714.0,79899.0,4612.0,105231.0000,31828.0000,...,,,,,,,,,,
3,01,018,2006,81678.0,16863.0,39098.0,88128.0,4851.0,110952.0000,31722.0000,...,,,,,,,,,,
4,01,018,2007,84166.0,16356.0,41275.0,97776.0,3894.0,118083.0000,31443.0000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7208,21,TOTAL,2018,,,,,53626.0,135757.5000,54686.4375,...,110.0,100.0,5.0,844.0,218.0,13.0,155.0,169.0,34.0,717.0
7209,21,TOTAL,2019,,,,,55975.0,136165.5000,54918.5000,...,113.0,104.0,5.0,884.0,226.0,14.0,171.0,176.0,36.0,741.0
7210,21,TOTAL,2020,,,,,65660.0,139930.0625,56271.7500,...,103.0,106.0,4.0,863.0,240.0,13.0,170.0,166.0,37.0,744.0
7211,21,TOTAL,2021,,,,,64339.0,136181.1875,55449.0625,...,114.0,181.0,6.0,968.0,242.0,14.0,189.0,180.0,42.0,812.0


In [47]:
final_df = pd.merge(
    left=final_df,
    right=income_expenditure_debt_df,
    how="outer",
    on=["Region", "Year", "Municipality"]
)
final_df

Unnamed: 0,Region,Municipality,Year,GDP per capita (euro at current prices),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)",...,"Employer's social security contributions, income","Property income, (incl. holding gains and losses), income","Property income, expenditure",Net national income,"Social benefits other than social transfers in kind, income","Other current transfers, income","Income, property and other direct taxes excl. capital taxes, expenditure",Employer's social security contributions,"Other current transfers, expenditure","Disposable income, net"
0,01,TOTAL,2000,36270.1,,,255.5,40.8,2501.6,5985.4,...,4676.0,4500.0,534.0,29720.0,5664.0,311.0,7232.0,6341.0,676.0,21447.0
1,01,TOTAL,2001,38454.8,,,235.4,43.4,2766.6,6491.5,...,5035.0,3355.0,608.0,30045.0,5879.0,325.0,7428.0,6746.0,734.0,21341.0
2,01,TOTAL,2002,38552.7,,,246.1,39.1,2449.9,6530.9,...,5100.0,3043.0,548.0,30730.0,6271.0,323.0,7514.0,6760.0,745.0,22305.0
3,01,018,2003,,,,,,,,...,,,,,,,,,,
4,01,049,2003,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7598,WHOLE COUNTRY,TOTAL,2019,,,,5223.0,819,14966,17939,...,,,,,,,,,,
7599,WHOLE COUNTRY,TOTAL,2020,,,,5296.0,964,15368,18421,...,,,,,,,,,,
7600,WHOLE COUNTRY,TOTAL,2021,,,,5371.0,1049,14924,19070,...,,,,,,,,,,
7601,WHOLE COUNTRY,TOTAL,2022,,,,5961.0,1100,15411,19557,...,,,,,,,,,,


### Education, population (Alina)

In [48]:
education_population_path = notebook_path / "data" / "education_population.csv"
education_population_df = pd.read_csv(education_population_path, dtype={"Region": str, "Municipality": str, "Year": int})
education_population_df

Unnamed: 0,Region,Municipality,Year,"Academic degree, higher university level degree (population)","Academic degree, lowest level tertiary and lower university level degrees (population)","Aged 18 or over, total (population)",Basic level studies (population),Matriculation examination (population),Vocational diploma (population),"With education, total (population)",...,65 - 69 Urban (population),70 - 74 Rural (population),70 - 74 Total (population),70 - 74 Urban (population),75- Rural (population),75- Total (population),75- Urban (population),Total Rural (population),Total (population),Total Urban (population)
0,01,018,2010,147,603,3628,1222,188,1468,2406,...,0,160,160,0,294,294,0,4829,4829,0
1,01,018,2011,161,619,3677,1210,168,1519,2467,...,0,170,170,0,309,309,0,4876,4876,0
2,01,018,2012,165,644,3721,1174,175,1563,2547,...,0,166,166,0,318,318,0,4949,4949,0
3,01,018,2013,172,656,3713,1121,183,1581,2592,...,0,166,166,0,321,321,0,4951,4951,0
4,01,018,2014,182,667,3788,1117,193,1629,2671,...,0,164,164,0,331,331,0,5028,5028,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4601,WHOLE COUNTRY,TOTAL,2019,514587,984147,4476235,1040869,305649,1630983,3435366,...,229077,118532,347456,228924,186557,520476,333919,1492023,5456236,3964213
4602,WHOLE COUNTRY,TOTAL,2020,532586,992956,4492267,1023598,305721,1637406,3468669,...,227259,120926,351851,230925,192078,543585,351507,1479096,5469271,3990175
4603,WHOLE COUNTRY,TOTAL,2021,548937,1002269,4512724,1010850,307326,1643342,3501874,...,227552,120136,346399,226263,199314,571475,372161,1468856,5482367,4013511
4604,WHOLE COUNTRY,TOTAL,2022,564198,1008725,4537778,1007600,310102,1647153,3530178,...,225310,118975,341125,222150,205631,596017,390386,1450321,5494815,4044494


In [49]:
final_df = pd.merge(
    left=final_df,
    right=education_population_df,
    how="outer",
    on=["Region", "Year", "Municipality"]
)
final_df

Unnamed: 0,Region,Municipality,Year,GDP per capita (euro at current prices),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)",...,65 - 69 Urban (population),70 - 74 Rural (population),70 - 74 Total (population),70 - 74 Urban (population),75- Rural (population),75- Total (population),75- Urban (population),Total Rural (population),Total (population),Total Urban (population)
0,01,TOTAL,2000,36270.1,,,255.5,40.8,2501.6,5985.4,...,,,,,,,,,,
1,01,TOTAL,2001,38454.8,,,235.4,43.4,2766.6,6491.5,...,,,,,,,,,,
2,01,TOTAL,2002,38552.7,,,246.1,39.1,2449.9,6530.9,...,,,,,,,,,,
3,01,018,2003,,,,,,,,...,,,,,,,,,,
4,01,049,2003,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7612,WHOLE COUNTRY,TOTAL,2019,,,,5223.0,819,14966,17939,...,229077.0,118532.0,347456.0,228924.0,186557.0,520476.0,333919.0,1492023.0,5456236.0,3964213.0
7613,WHOLE COUNTRY,TOTAL,2020,,,,5296.0,964,15368,18421,...,227259.0,120926.0,351851.0,230925.0,192078.0,543585.0,351507.0,1479096.0,5469271.0,3990175.0
7614,WHOLE COUNTRY,TOTAL,2021,,,,5371.0,1049,14924,19070,...,227552.0,120136.0,346399.0,226263.0,199314.0,571475.0,372161.0,1468856.0,5482367.0,4013511.0
7615,WHOLE COUNTRY,TOTAL,2022,,,,5961.0,1100,15411,19557,...,225310.0,118975.0,341125.0,222150.0,205631.0,596017.0,390386.0,1450321.0,5494815.0,4044494.0


### Final formatting

In [58]:
final_df = final_df.sort_values(["Region", "Municipality", "Year"])
final_df

Unnamed: 0,Region,Municipality,Year,GDP per capita (euro at current prices),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)",...,65 - 69 Urban (population),70 - 74 Rural (population),70 - 74 Total (population),70 - 74 Urban (population),75- Rural (population),75- Total (population),75- Urban (population),Total Rural (population),Total (population),Total Urban (population)
3,01,018,2003,,,,,,,,...,,,,,,,,,,
30,01,018,2004,,,,,,,,...,,,,,,,,,,
57,01,018,2005,,,,,,,,...,,,,,,,,,,
84,01,018,2006,,,,,,,,...,,,,,,,,,,
111,01,018,2007,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7612,WHOLE COUNTRY,TOTAL,2019,,,,5223.0,819,14966,17939,...,229077.0,118532.0,347456.0,228924.0,186557.0,520476.0,333919.0,1492023.0,5456236.0,3964213.0
7613,WHOLE COUNTRY,TOTAL,2020,,,,5296.0,964,15368,18421,...,227259.0,120926.0,351851.0,230925.0,192078.0,543585.0,351507.0,1479096.0,5469271.0,3990175.0
7614,WHOLE COUNTRY,TOTAL,2021,,,,5371.0,1049,14924,19070,...,227552.0,120136.0,346399.0,226263.0,199314.0,571475.0,372161.0,1468856.0,5482367.0,4013511.0
7615,WHOLE COUNTRY,TOTAL,2022,,,,5961.0,1100,15411,19557,...,225310.0,118975.0,341125.0,222150.0,205631.0,596017.0,390386.0,1450321.0,5494815.0,4044494.0


In [60]:
final_df

Unnamed: 0,Region,Municipality,Year,GDP per capita (euro at current prices),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)",...,65 - 69 Urban (population),70 - 74 Rural (population),70 - 74 Total (population),70 - 74 Urban (population),75- Rural (population),75- Total (population),75- Urban (population),Total Rural (population),Total (population),Total Urban (population)
3,01,018,2003,,,,,,,,...,,,,,,,,,,
30,01,018,2004,,,,,,,,...,,,,,,,,,,
57,01,018,2005,,,,,,,,...,,,,,,,,,,
84,01,018,2006,,,,,,,,...,,,,,,,,,,
111,01,018,2007,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7612,WHOLE COUNTRY,TOTAL,2019,,,,5223.0,819,14966,17939,...,229077.0,118532.0,347456.0,228924.0,186557.0,520476.0,333919.0,1492023.0,5456236.0,3964213.0
7613,WHOLE COUNTRY,TOTAL,2020,,,,5296.0,964,15368,18421,...,227259.0,120926.0,351851.0,230925.0,192078.0,543585.0,351507.0,1479096.0,5469271.0,3990175.0
7614,WHOLE COUNTRY,TOTAL,2021,,,,5371.0,1049,14924,19070,...,227552.0,120136.0,346399.0,226263.0,199314.0,571475.0,372161.0,1468856.0,5482367.0,4013511.0
7615,WHOLE COUNTRY,TOTAL,2022,,,,5961.0,1100,15411,19557,...,225310.0,118975.0,341125.0,222150.0,205631.0,596017.0,390386.0,1450321.0,5494815.0,4044494.0


In [None]:
import numpy as np

def fill_region_totals(df, total_value="TOTAL"):
    # Identify numeric columns to sum
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    # Exclude group columns and municipality column from numeric columns
    numeric_cols = [col for col in numeric_cols if col not in ["Region", "Year", "Municipality"]]

    # For each region and year, fill missing TOTALs
    for (region, year), group in df.groupby(["Region", "Year"]):
        # Get the index of the TOTAL row (if it exists)
        mask_total = (
            (df["Region"] == region) &
            (df["Year"] == year) &
            (df["Municipality"] == total_value)
        )
        if mask_total.any():
            idx_total = df[mask_total].index[0]
            # Only fill if all values are missing (NaN/None) for each column
            for col in numeric_cols:
                if pd.isna(df.at[idx_total, col]):
                    # Sum all municipality values for this region/year/column
                    vals = group[group["Municipality"] != total_value][col]
                    summed = vals.sum(skipna=True)
                    # Only fill if there is at least one non-missing value
                    if not vals.isna().all():
                        df.at[idx_total, col] = summed
    return df

final_df = fill_region_totals(final_df)
final_df

Unnamed: 0,Region,Municipality,Year,GDP per capita (euro at current prices),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)","Gross value added (millions of euro), B Mining and quarrying (05-09)","Gross value added (millions of euro), F Construction (41-43)","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles (45-47)",...,65 - 69 Urban (population),70 - 74 Rural (population),70 - 74 Total (population),70 - 74 Urban (population),75- Rural (population),75- Total (population),75- Urban (population),Total Rural (population),Total (population),Total Urban (population)
3,01,018,2003,,,,,,,,...,,,,,,,,,,
30,01,018,2004,,,,,,,,...,,,,,,,,,,
57,01,018,2005,,,,,,,,...,,,,,,,,,,
84,01,018,2006,,,,,,,,...,,,,,,,,,,
111,01,018,2007,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7508,21,TOTAL,2020,39532.8,5.976800e+10,5.744000e+10,31.3,1.5,89.4,63.4,...,0.0,1859.0,1859.0,0.0,3044.0,3044.0,0.0,29646.0,29646.0,0.0
7525,21,TOTAL,2021,44250.6,7.293600e+10,6.890800e+10,29.3,1.7,94.3,73.5,...,0.0,1828.0,1828.0,0.0,3235.0,3235.0,0.0,29832.0,29832.0,0.0
7542,21,TOTAL,2022,48912.8,9.246900e+10,8.188500e+10,21.7,1.7,83,76.5,...,0.0,1836.0,1836.0,0.0,3405.0,3405.0,0.0,29902.0,29902.0,0.0
7559,21,TOTAL,2023,,7.684900e+10,7.636200e+10,26.4,,,,...,0.0,1871.0,1871.0,0.0,3570.0,3570.0,0.0,30075.0,30075.0,0.0


In [73]:
# save final df to csv
output_path = notebook_path / "data" / "final_economic_data.csv"
final_df.to_csv(output_path, index=False)

### Actually final formatting (merging Ignacio's dataset that has been supplemented w Finnish data)

In [43]:
final_df_path = notebook_path / "data" / "final_economic_data.csv"
enterprise_path = notebook_path / "data" / "enterprises_bankrupcies_investments.csv"

final_df = pd.read_csv(final_df_path, dtype={"Region": str, "Municipality": str, "Year": str})
enterprise_df = pd.read_csv(enterprise_path, dtype={"Region": str, "Municipality": str, "Year": str})

final_df


Unnamed: 0,Region,Municipality,Year,GDP per capita (euro),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing","Gross value added (millions of euro), B Mining and quarrying","Gross value added (millions of euro), F Construction","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles",...,65 - 69 Urban (population),70 - 74 Rural (population),70 - 74 Total (population),70 - 74 Urban (population),75- Rural (population),75- Total (population),75- Urban (population),Total Rural (population),Total (population),Total Urban (population)
0,01,018,2003,,,,,,,,...,,,,,,,,,,
1,01,018,2004,,,,,,,,...,,,,,,,,,,
2,01,018,2005,,,,,,,,...,,,,,,,,,,
3,01,018,2006,,,,,,,,...,,,,,,,,,,
4,01,018,2007,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7588,21,TOTAL,2020,39532.8,5.976800e+10,5.744000e+10,31.3,1.5,89.4,63.4,...,0.0,1859.0,1859.0,0.0,3044.0,3044.0,0.0,29646.0,29646.0,0.0
7589,21,TOTAL,2021,44250.6,7.293600e+10,6.890800e+10,29.3,1.7,94.3,73.5,...,0.0,1828.0,1828.0,0.0,3235.0,3235.0,0.0,29832.0,29832.0,0.0
7590,21,TOTAL,2022,48912.8,9.246900e+10,8.188500e+10,21.7,1.7,83.0,76.5,...,0.0,1836.0,1836.0,0.0,3405.0,3405.0,0.0,29902.0,29902.0,0.0
7591,21,TOTAL,2023,,7.684900e+10,7.636200e+10,26.4,,,,...,0.0,1871.0,1871.0,0.0,3570.0,3570.0,0.0,30075.0,30075.0,0.0


In [4]:
enterprise_df

Unnamed: 0,Region,Municipality,Year,"A Agriculture, forestry and fishing Net opening of enterprises (unit)",B Mining and quarrying Net opening of enterprises (unit),C Manufacturing Net opening of enterprises (unit),"D Electricity, gas, steam and air conditioning supply Net opening of enterprises (unit)","E Water supply; sewerage, waste management and remediation activities Net opening of enterprises (unit)",F Construction Net opening of enterprises (unit),G Wholesale and retail trade; repair of motor vehicles and motorcycles Net opening of enterprises (unit),...,N Administrative and support service activities Investments (mln of euro),O Public administration and defence; compulsory social security Investments (mln of euro),P Education Investments (mln of euro),Q Human health and social work activities Investments (mln of euro),T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use Investments (mln of euro),C Manufacturing Investments (mln of euro),D Electricity Investments (mln of euro),E Water Supply Investments (mln of euro),R Human health and social work activities Investments (mln of euro),"S Arts, sports and recreation Investments (mln of euro)"
0,01,018,2003,,,,,,,,...,,,,,,,,,,
1,01,018,2004,,,,,,,,...,,,,,,,,,,
2,01,018,2005,,,,,,,,...,,,,,,,,,,
3,01,018,2006,,,,,,,,...,,,,,,,,,,
4,01,018,2007,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12955,21,TOTAL,2020,3.0,0.0,8.0,1.0,1.0,12.0,2.0,...,,,,,,,,,,
12956,21,TOTAL,2021,16.0,-1.0,4.0,2.0,-1.0,18.0,2.0,...,,,,,,,,,,
12957,21,TOTAL,2022,6.0,0.0,2.0,7.0,0.0,-1.0,2.0,...,,,,,,,,,,
12958,21,TOTAL,2023,8.0,0.0,4.0,9.0,0.0,-5.0,-1.0,...,,,,,,,,,,


In [7]:
# remove identical rows
enterprise_df = enterprise_df.drop_duplicates()
enterprise_df

Unnamed: 0,Region,Municipality,Year,"A Agriculture, forestry and fishing Net opening of enterprises (unit)",B Mining and quarrying Net opening of enterprises (unit),C Manufacturing Net opening of enterprises (unit),"D Electricity, gas, steam and air conditioning supply Net opening of enterprises (unit)","E Water supply; sewerage, waste management and remediation activities Net opening of enterprises (unit)",F Construction Net opening of enterprises (unit),G Wholesale and retail trade; repair of motor vehicles and motorcycles Net opening of enterprises (unit),...,N Administrative and support service activities Investments (mln of euro),O Public administration and defence; compulsory social security Investments (mln of euro),P Education Investments (mln of euro),Q Human health and social work activities Investments (mln of euro),T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use Investments (mln of euro),C Manufacturing Investments (mln of euro),D Electricity Investments (mln of euro),E Water Supply Investments (mln of euro),R Human health and social work activities Investments (mln of euro),"S Arts, sports and recreation Investments (mln of euro)"
0,01,018,2003,,,,,,,,...,,,,,,,,,,
1,01,018,2004,,,,,,,,...,,,,,,,,,,
2,01,018,2005,,,,,,,,...,,,,,,,,,,
3,01,018,2006,,,,,,,,...,,,,,,,,,,
4,01,018,2007,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12955,21,TOTAL,2020,3.0,0.0,8.0,1.0,1.0,12.0,2.0,...,,,,,,,,,,
12956,21,TOTAL,2021,16.0,-1.0,4.0,2.0,-1.0,18.0,2.0,...,,,,,,,,,,
12957,21,TOTAL,2022,6.0,0.0,2.0,7.0,0.0,-1.0,2.0,...,,,,,,,,,,
12958,21,TOTAL,2023,8.0,0.0,4.0,9.0,0.0,-5.0,-1.0,...,,,,,,,,,,


Can't merge this with the final dataset. There are repeat years for municipalities (e.g. 01, 018 has each year following 2013 four times). Dropping duplicate rows drops 750 rows, but does not entirely resolve the issue above due to differing values across the same year that is being repeated. For example, year 2013 for municipality 018, region 01, has values 28.00 and 29.00 for column "C Manufacturing Stock of Enterprises (unit)"

In [5]:
final_df = merge_with_final(enterprise_df, final_df)
final_df

Unnamed: 0,Region,Municipality,Year,GDP per capita (euro),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing","Gross value added (millions of euro), B Mining and quarrying","Gross value added (millions of euro), F Construction","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles",...,N Administrative and support service activities Investments (mln of euro),O Public administration and defence; compulsory social security Investments (mln of euro),P Education Investments (mln of euro),Q Human health and social work activities Investments (mln of euro),T Activities of households as employers; undifferentiated goods- and services-producing activities of households for own use Investments (mln of euro),C Manufacturing Investments (mln of euro),D Electricity Investments (mln of euro),E Water Supply Investments (mln of euro),R Human health and social work activities Investments (mln of euro),"S Arts, sports and recreation Investments (mln of euro)"
0,01,TOTAL,2000,36270.1,,,255.5,40.8,2501.6,5985.4,...,193.9,840.2,544.6,251.5,0.0,3291.7,172.65,172.65,125.20,125.20
1,01,TOTAL,2001,38454.8,,,235.4,43.4,2766.6,6491.5,...,211.2,813.4,601.5,262.4,0.0,3177.3,161.15,161.15,117.60,117.60
2,01,TOTAL,2002,38552.7,,,246.1,39.1,2449.9,6530.9,...,210.0,887.4,678.7,282.6,0.0,2779.8,187.75,187.75,120.65,120.65
3,01,018,2003,,,,,,,,...,,,,,,,,,,
4,01,049,2003,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13300,21,478,2025,,,,,,,,...,,,,,,,,,,
13301,21,736,2025,,,,,,,,...,,,,,,,,,,
13302,21,766,2025,,,,,,,,...,,,,,,,,,,
13303,21,771,2025,,,,,,,,...,,,,,,,,,,


In [None]:
# feature selection??

### Add missing industries to final dataset

In [32]:
import pandas as pd
from pathlib import Path

notebook_path = Path().resolve()
final_path = notebook_path / "data" / "final_economic_data.csv"
regional_path = notebook_path / "data" / "regional_economic_data.csv"
remaining_industries_path = notebook_path / "data" / "remaining_industries.xlsx"

final_df = pd.read_csv(final_path, dtype={"Region": str, "Municipality": str, "Year": int})
regional_df = pd.read_csv(regional_path, dtype={"Region": str, "Municipality": str, "Year": int})
# get only columns Region, Year, and those with "Gross value added" in their name
remaining_industries_df = pd.read_excel(remaining_industries_path, dtype={"Region": str, "Year": int})
remaining_industries_df = remaining_industries_df[["Region", "Year"] + [col for col in remaining_industries_df.columns if "Gross value added" in col]]

remaining_industries_df

Unnamed: 0,Region,Year,"Gross value added (millions of euro), C Manufacturing (10-33)","Gross value added (millions of euro), D, E Electricity, gas, steam and air conditioning and water supply; sewerage and waste management (35-39)","Gross value added (millions of euro), L Real estate activities"
0,MK01 Uusimaa,2000,15598.9,964.9,4110.6
1,,2001,16651.9,1049.8,4309.0
2,,2002,15138.7,1145.8,4661.6
3,,2003,15059.6,1354.2,4864.7
4,,2004,15142.6,1343.7,4920.7
...,...,...,...,...,...
424,,2018,94.4,29.2,128.8
425,,2019,111.7,27.2,136.4
426,,2020,109.5,31.8,134.4
427,,2021,132.4,29.2,133.0


In [29]:
from data_cleaning import split_code_from_name, ffill_col

reminaining_industries_df = ffill_col(remaining_industries_df, "Region")
remaining_industries_df = split_code_from_name(remaining_industries_df, "Region", code_length=2)
remaining_industries_df

Unnamed: 0,Region,Year,"Gross value added (millions of euro), C Manufacturing (10-33)","Gross value added (millions of euro), D, E Electricity, gas, steam and air conditioning and water supply; sewerage and waste management (35-39)","Gross value added (millions of euro), L Real estate activities"
0,01,2000,15598.9,964.9,4110.6
1,01,2001,16651.9,1049.8,4309.0
2,01,2002,15138.7,1145.8,4661.6
3,01,2003,15059.6,1354.2,4864.7
4,01,2004,15142.6,1343.7,4920.7
...,...,...,...,...,...
424,21,2018,94.4,29.2,128.8
425,21,2019,111.7,27.2,136.4
426,21,2020,109.5,31.8,134.4
427,21,2021,132.4,29.2,133.0


In [36]:
from data_cleaning import pad_code
regional_df["Region"] = pad_code(regional_df, "Region", code_length=2)
regional_df

Unnamed: 0,Region,Year,GDP per capita (euro),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing","Gross value added (millions of euro), B Mining and quarrying","Gross value added (millions of euro), F Construction","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles","Gross value added (millions of euro), H Transportation and storage",...,"Other current transfers, expenditure (euro)","Disposable income, net (euro)","Academic degree, higher university level degree (population)","Academic degree, lowest level tertiary and lower university level degrees (population)","Aged 18 or over, total (population)",Basic level studies (population),Matriculation examination (population),Vocational diploma (population),"With education, total (population)",Total (population)
0,01,2000,36270.1,,,255.5,40.8,2501.6,5985.4,3248.8,...,676.0,21447.0,,,,,,,,
1,01,2001,38454.8,,,235.4,43.4,2766.6,6491.5,3325.1,...,734.0,21341.0,,,,,,,,
2,01,2002,38552.7,,,246.1,39.1,2449.9,6530.9,3381.9,...,745.0,22305.0,,,,,,,,
3,01,2003,38643.2,,,235.6,34.6,2567.0,6380.4,3269.9,...,745.0,23552.0,,,,,,,,
4,01,2004,40312.2,,,233.9,42.8,2754.4,6842.6,3266.5,...,766.0,24999.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470,21,2020,39532.8,5.976800e+10,5.744000e+10,31.3,1.5,89.4,63.4,103.7,...,37.0,744.0,1961.0,5302.0,23764.0,6533.0,1165.0,8747.0,17175.0,29646.0
471,21,2021,44250.6,7.293600e+10,6.890800e+10,29.3,1.7,94.3,73.5,137.5,...,42.0,812.0,2036.0,5405.0,23907.0,6576.0,1141.0,8723.0,17305.0,29832.0
472,21,2022,48912.8,9.246900e+10,8.188500e+10,21.7,1.7,83.0,76.5,218.9,...,33.0,826.0,2049.0,5427.0,24020.0,6601.0,1194.0,8728.0,17398.0,29902.0
473,21,2023,,7.684900e+10,7.636200e+10,26.4,,,,,...,,,2296.0,5690.0,24212.0,6164.0,1114.0,8928.0,18028.0,30075.0


In [37]:
from data_cleaning import merge_with_final

# Region should be int, Year int
remaining_industries_df["Region"] = remaining_industries_df["Region"].astype(str)
remaining_industries_df["Year"] = remaining_industries_df["Year"].astype(int)
# merge remaining_industries_df with regional_df, save to regional_path
regional_df = pd.merge(
    left=regional_df,
    right=remaining_industries_df,
    how="outer",
    on=["Region", "Year"]
)
regional_df

Unnamed: 0,Region,Year,GDP per capita (euro),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing","Gross value added (millions of euro), B Mining and quarrying","Gross value added (millions of euro), F Construction","Gross value added (millions of euro), G Wholesale and retail trade; repair of motor vehicles and motorcycles","Gross value added (millions of euro), H Transportation and storage",...,"Academic degree, lowest level tertiary and lower university level degrees (population)","Aged 18 or over, total (population)",Basic level studies (population),Matriculation examination (population),Vocational diploma (population),"With education, total (population)",Total (population),"Gross value added (millions of euro), C Manufacturing (10-33)","Gross value added (millions of euro), D, E Electricity, gas, steam and air conditioning and water supply; sewerage and waste management (35-39)","Gross value added (millions of euro), L Real estate activities"
0,01,2000,36270.1,,,255.5,40.8,2501.6,5985.4,3248.8,...,,,,,,,,,,
1,01,2001,38454.8,,,235.4,43.4,2766.6,6491.5,3325.1,...,,,,,,,,,,
2,01,2002,38552.7,,,246.1,39.1,2449.9,6530.9,3381.9,...,,,,,,,,,,
3,01,2003,38643.2,,,235.6,34.6,2567.0,6380.4,3269.9,...,,,,,,,,,,
4,01,2004,40312.2,,,233.9,42.8,2754.4,6842.6,3266.5,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899,,2022,,,,,,,,,...,,,,,,,,852.3,112.1,307.5
900,,2022,,,,,,,,,...,,,,,,,,4966.4,645.7,1980.4
901,,2022,,,,,,,,,...,,,,,,,,435.8,216.5,331.7
902,,2022,,,,,,,,,...,,,,,,,,2334.6,402.8,862.5


In [39]:
# save to csv
regional_df.to_csv(regional_path, index=False)

In [41]:
# merge remaining_industries_df with final_df, save to final_path
remaining_industries_df["Municipality"] = "TOTAL"
final_df = merge_with_final(final_df, remaining_industries_df)
final_df

Unnamed: 0,Region,Year,"Gross value added (millions of euro), C Manufacturing (10-33)","Gross value added (millions of euro), D, E Electricity, gas, steam and air conditioning and water supply; sewerage and waste management (35-39)","Gross value added (millions of euro), L Real estate activities",Municipality,GDP per capita (euro at current prices),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)",...,65 - 69 Urban (population),70 - 74 Rural (population),70 - 74 Total (population),70 - 74 Urban (population),75- Rural (population),75- Total (population),75- Urban (population),Total Rural (population),Total (population),Total Urban (population)
0,01,2000,,,,TOTAL,36270.1,,,255.5,...,,,,,,,,,,
1,01,2001,,,,TOTAL,38454.8,,,235.4,...,,,,,,,,,,
2,01,2002,,,,TOTAL,38552.7,,,246.1,...,,,,,,,,,,
3,01,2003,,,,018,,,,,...,,,,,,,,,,
4,01,2003,,,,049,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8017,,2022,852.3,112.1,307.5,TOTAL,,,,,...,,,,,,,,,,
8018,,2022,4966.4,645.7,1980.4,TOTAL,,,,,...,,,,,,,,,,
8019,,2022,435.8,216.5,331.7,TOTAL,,,,,...,,,,,,,,,,
8020,,2022,2334.6,402.8,862.5,TOTAL,,,,,...,,,,,,,,,,


In [43]:
# sort final_df correctly so within each region, Years are in order and municipalities are sorted by code (ending with "TOTAL")
final_df = final_df.sort_values(["Region", "Year", "Municipality"])
final_df

Unnamed: 0,Region,Year,"Gross value added (millions of euro), C Manufacturing (10-33)","Gross value added (millions of euro), D, E Electricity, gas, steam and air conditioning and water supply; sewerage and waste management (35-39)","Gross value added (millions of euro), L Real estate activities",Municipality,GDP per capita (euro at current prices),Imports (euro),Exports (euro),"Gross value added (millions of euro), A Agriculture, forestry and fishing (01-03)",...,65 - 69 Urban (population),70 - 74 Rural (population),70 - 74 Total (population),70 - 74 Urban (population),75- Rural (population),75- Total (population),75- Urban (population),Total Rural (population),Total (population),Total Urban (population)
0,01,2000,,,,TOTAL,36270.1,,,255.5,...,,,,,,,,,,
1,01,2001,,,,TOTAL,38454.8,,,235.4,...,,,,,,,,,,
2,01,2002,,,,TOTAL,38552.7,,,246.1,...,,,,,,,,,,
3,01,2003,,,,018,,,,,...,,,,,,,,,,
4,01,2003,,,,049,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8017,,2022,852.3,112.1,307.5,TOTAL,,,,,...,,,,,,,,,,
8018,,2022,4966.4,645.7,1980.4,TOTAL,,,,,...,,,,,,,,,,
8019,,2022,435.8,216.5,331.7,TOTAL,,,,,...,,,,,,,,,,
8020,,2022,2334.6,402.8,862.5,TOTAL,,,,,...,,,,,,,,,,


In [44]:
final_df.to_csv(final_path, index=False)