### Data by NACE curation


In [1]:
import os
import pandas as pd
from datetime import datetime

### PPI by NACE 

source: CZSO

- yearly, base 2015
- contains industry (B,C,D,E), and levels 1, 2, 3

In [43]:
# Define file paths
script_dir = os.getcwd()  # Current directory in Jupyter
project_root = os.path.abspath(os.path.join(script_dir, ".."))
input_file = os.path.join(project_root, "data", "source_raw", "NACE", "ipccr031725_21_CSU_PPI by NACE.xlsx")
output_folder = os.path.join(project_root, "data", "source_cleaned")
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
output_file = os.path.join(output_folder, "ppi_by_nace_annual.parquet")

# Read the Excel file
df_ppi_by_nace = pd.read_excel(input_file, sheet_name="IR15 roční (yearly)", header=1)

# delete last two rows (notes)
df_ppi_by_nace = df_ppi_by_nace.iloc[:-2]

# transform code column: 
# replace "B,C,D,E" -> "industry"
df_ppi_by_nace["Code"] = df_ppi_by_nace["Code"].replace({"B,C,D,E": "industry"})

# Unnamed: 2 to name_cs, Unnamed: 3 to name_en
df_ppi_by_nace.rename(columns={"Unnamed: 2": "name_cs", "Unnamed: 3": "name_en"}, inplace=True)

# deduct 1 from Level
df_ppi_by_nace["Level"] = df_ppi_by_nace["Level"] - 1
# rename to level
df_ppi_by_nace.rename(columns={"Level": "level"}, inplace=True)

# rename Code to czso_code: consistency with other datasets
df_ppi_by_nace.rename(columns={"Code": "czso_code"}, inplace=True)

# transform to tidy format
df_ppi_by_nace = df_ppi_by_nace.melt(id_vars=["czso_code", "name_cs", "name_en", "level"], var_name="year", value_name="value")

# drop pre 2000 values
df_ppi_by_nace = df_ppi_by_nace[df_ppi_by_nace["year"].astype(int) >= 2000]

# add metric: ppi_by_nace
df_ppi_by_nace["metric"] = "ppi_by_nace"
# add unit: index
df_ppi_by_nace["unit"] = "2015=100"

# replace i.d. and : by null in value col (: = missing value, i.d. = individual data)
df_ppi_by_nace["value"] = df_ppi_by_nace["value"].replace({"i.d." : None, ":": None})



In [44]:
df_ppi_by_nace

Unnamed: 0,czso_code,name_cs,name_en,level,year,value,metric,unit
1070,industry,ÚHRN,INDUSTRY - TOTAL,0,2000,79.9,ppi_by_nace,2015=100
1071,B,TĚŽBA A DOBÝVÁNÍ,MINING AND QUARRYING,1,2000,66.4,ppi_by_nace,2015=100
1072,05,Černé a hnědé uhlí a lignit,Coal and lignite,2,2000,63.6,ppi_by_nace,2015=100
1073,06,Ropa a zemní plyn,Crude petroleum and natural gas,2,2000,,ppi_by_nace,2015=100
1074,061,Ropa,Crude petroleum,3,2000,,ppi_by_nace,2015=100
...,...,...,...,...,...,...,...,...
3740,352,"Vyrobený plyn; rozvod plyn. paliv, obchod s pl...",Manufactured gas; distribution services of gas...,3,2024,160.8,ppi_by_nace,2015=100
3741,353,Pára a horká voda; dodávání páry a klimatiz. v...,Steam and air conditioning supply services,3,2024,214.9,ppi_by_nace,2015=100
3742,E,ZÁSOBOVÁNÍ VODOU; SLUŽBY SOUV. S ODPAD. VODAMI,"WATER SUPPLY; SEWERAGE, WASTE MANAGEMENT AND R...",1,2024,161.9,ppi_by_nace,2015=100
3743,36,"Přír. voda; úprava a rozvod vody, obchod s vod...",Natural water; water treatment and supply serv...,2,2024,161.9,ppi_by_nace,2015=100


### Wages by NACE

source: CZSO
unit: Average gross monthly wage by activity of CZ-NACE
CZK, per full-time equivalent employee

- 2023 and 2024 preliminary data 

-  The data refer only to the employees with an employment contract with the reporting units. Excludes persons performing public office, such as Members of Parliament, Senators, full-time councillors at all levels, judges, etc. The average wages refer to wages accounted for payment in the given period. 																					

In [70]:
# Define file paths
input_file = os.path.join(project_root, "data", "source_raw", "NACE", "pmzcr030625_2_wages by NACE.xlsx")

# Read the Excel file
df_wages = pd.read_excel(input_file, sheet_name="List1", header=4)

# remove first two rows 
df_wages = df_wages.iloc[2:]

# remove last three rows (notes)
df_wages = df_wages.iloc[:-3]

# rename columns: 
df_wages.rename(columns={"Unnamed: 0": "czso_code", "Unnamed: 1": "name_cs"}, inplace=True)

# keep only columns czso_code, name_cs and those that start with "Q1-Q4"
df_wages = df_wages[["czso_code", "name_cs"] + [col for col in df_wages.columns if col.startswith("Q1-Q4")]]
# rename the columns from Q1-Q4	Q1-Q4.1	Q1-Q4.2	Q1-Q4.3	Q1-Q4.4 to 2000, 2001, 2002, 2003, 2004, ...
df_wages.columns = ["czso_code", "name_cs"] + [str(year) for year in range(2000, 2025)]

# in czso_code replace  value that STARTS with "B+C+D+E" only with "industry"







In [71]:
df_wages

Unnamed: 0,czso_code,name_cs,2000,2001,2002,2003,2004,2005,2006,2007,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
2,A,"Zemědělství, lesnictví a rybářství\nAgricultur...",10456.0,11447.0,11813.0,12188.0,13244.0,13961.0,14838.0,16194.0,...,21668.0,22634.0,23831.0,25486.0,28044.0,29138.0,30337.0,31342.0,33768.0,35694.0
3,"*industry*\nB+C+D+E Industry, t o t a l",,13234.0,14153.0,15081.0,15850.0,17021.0,17837.0,18977.0,20311.0,...,26857.0,28000.0,29910.0,32227.0,34364.0,35061.0,36853.0,39056.0,42667.0,45815.0
4,B,Těžba a dobývání\nMining and quarrying,16553.0,17743.0,18749.0,19688.0,21122.0,22679.0,24047.0,25714.0,...,31800.0,31602.0,33483.0,36028.0,37301.0,37320.0,39288.0,43025.0,47410.0,49829.0
5,C,Zpracovatelský průmysl\nManufacturing,12845.0,13761.0,14659.0,15410.0,16584.0,17362.0,18490.0,19852.0,...,26457.0,27676.0,29585.0,31893.0,34008.0,34597.0,36407.0,38537.0,41965.0,45071.0
6,D,"Výroba a rozvod elektřiny, plynu, tepla a klim...",18468.0,19833.0,21675.0,23084.0,24725.0,26594.0,29179.0,31157.0,...,40449.0,41432.0,43595.0,46375.0,49480.0,52509.0,54745.0,59251.0,67647.0,71480.0
7,E,Zásobování vodou; činnosti související s odpad...,13235.0,14071.0,15144.0,15788.0,16775.0,17703.0,18749.0,19750.0,...,24768.0,25394.0,26941.0,28736.0,30715.0,32264.0,33465.0,35268.0,38896.0,42304.0
8,F,Stavebnictví\nConstruction,12623.0,13537.0,14213.0,15203.0,16279.0,16808.0,17885.0,19036.0,...,23979.0,24944.0,25995.0,28193.0,30187.0,31442.0,32583.0,33617.0,36506.0,39659.0
9,G,Velkoobchod a maloobchod; opravy a údržba moto...,12570.0,13709.0,14778.0,15382.0,16302.0,17058.0,18238.0,19821.0,...,24911.0,26097.0,28040.0,29986.0,32321.0,33482.0,35783.0,37663.0,40145.0,43126.0
10,H,Doprava a skladování\nTransportation and storage,13368.0,14298.0,15417.0,16260.0,17350.0,18188.0,19262.0,20663.0,...,24657.0,25822.0,27438.0,29462.0,31626.0,32078.0,33606.0,35975.0,38941.0,42166.0
11,I,"Ubytování, stravování a pohostinství\nAccommod...",7526.0,8664.0,9586.0,9826.0,10183.0,10637.0,11676.0,12380.0,...,14845.0,15701.0,17480.0,19272.0,20934.0,20296.0,22032.0,23173.0,25360.0,27595.0


In [48]:
df_wages

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Q1,Q2,Q3,Q4,Q1-Q2,Q1-Q3,Q1-Q4,Q1.1,...,Q1-Q2.23,Q1-Q3.23,Q1-Q4.23,Q1.24,Q2.24,Q3.24,Q4.24,Q1-Q2.24,Q1-Q3.24,Q1-Q4.24
0,"Česká republika c e l k e m\nCzech Republic, ...",,11941.0,13227.0,12963.0,14717.0,12588.0,12714.0,13219.0,13052.0,...,42060.0,42186.0,43120.0,44047.0,45889.0,45464.0,49229.0,44969.0,45134.0,46165.0
1,v tom:,,,,,,,,,,...,,,,,,,,,,
2,A,"Zemědělství, lesnictví a rybářství\nAgricultur...",9017.0,10207.0,10889.0,11696.0,9623.0,10053.0,10456.0,9895.0,...,31721.0,32658.0,33768.0,32041.0,34734.0,36494.0,39473.0,33401.0,34439.0,35694.0
3,B+C+D+E Průmysl c e l k e m\nB+C+D+E Industry...,,12130.0,13182.0,13057.0,14532.0,12659.0,12793.0,13234.0,13056.0,...,41976.0,41868.0,42667.0,43883.0,46626.0,44600.0,48168.0,45252.0,45036.0,45815.0
4,B,Těžba a dobývání\nMining and quarrying,14810.0,16841.0,15993.0,18623.0,15823.0,15879.0,16553.0,15853.0,...,45483.0,45242.0,47410.0,48012.0,49452.0,47454.0,54593.0,48731.0,48318.0,49829.0
5,C,Zpracovatelský průmysl\nManufacturing,11827.0,12749.0,12754.0,14010.0,12290.0,12447.0,12845.0,12763.0,...,41240.0,41218.0,41965.0,42797.0,46177.0,44000.0,47331.0,44483.0,44323.0,45071.0
6,D,"Výroba a rozvod elektřiny, plynu, tepla a klim...",16158.0,18646.0,17343.0,21821.0,17393.0,17377.0,18468.0,17384.0,...,70196.0,67123.0,67647.0,81359.0,65722.0,66495.0,72424.0,73509.0,71161.0,71480.0
7,E,Zásobování vodou; činnosti související s odpad...,11803.0,13409.0,12609.0,15091.0,12616.0,12614.0,13235.0,12574.0,...,37282.0,37382.0,38896.0,39599.0,41956.0,41060.0,46512.0,40785.0,40877.0,42304.0
8,F,Stavebnictví\nConstruction,11087.0,12430.0,12856.0,14056.0,11775.0,12143.0,12623.0,11840.0,...,35168.0,35771.0,36506.0,36562.0,39253.0,40185.0,42531.0,37925.0,38686.0,39659.0
9,G,Velkoobchod a maloobchod; opravy a údržba moto...,11501.0,12372.0,12568.0,13772.0,11939.0,12152.0,12570.0,12651.0,...,39634.0,39584.0,40145.0,42350.0,42320.0,42610.0,45207.0,42335.0,42427.0,43126.0
