### Data by NACE curation


In [None]:
import os
import pandas as pd
from datetime import datetime

### PPI by NACE (industrial)

source: CZSO

- yearly, base 2015
- contains industry (B,C,D,E), and levels 1, 2, 3

In [None]:
# --- PPI by NACE: load the CZSO yearly sheet and reshape it into tidy format ---

# Define file paths; the project root is taken to be the parent of the
# current working directory.
script_dir = os.getcwd()  # Current directory in Jupyter
project_root = os.path.abspath(os.path.join(script_dir, ".."))
input_file = os.path.join(project_root, "data", "source_raw", "NACE", "ipccr031725_21_CSU_PPI by NACE.xlsx")

# Read the Excel file; header=1 takes the column names from the second row of the sheet
df_ppi_by_nace = pd.read_excel(input_file, sheet_name="IR15 roční (yearly)", header=1)

# Delete the last two rows (notes). The explicit copy makes the frame
# independent of the raw sheet, so the column assignments below do not
# operate on a slice (avoids SettingWithCopyWarning).
df_ppi_by_nace = df_ppi_by_nace.iloc[:-2].copy()

# Transform code column: replace "B,C,D,E" -> "industry"
df_ppi_by_nace["Code"] = df_ppi_by_nace["Code"].replace({"B,C,D,E": "industry"})

# Deduct 1 from Level
df_ppi_by_nace["Level"] = df_ppi_by_nace["Level"] - 1

# Rename columns in one pass:
# - Unnamed: 2 -> name_cs, Unnamed: 3 -> name_en
# - Level -> level
# - Code -> czso_code (consistency with other datasets)
df_ppi_by_nace = df_ppi_by_nace.rename(
    columns={
        "Unnamed: 2": "name_cs",
        "Unnamed: 3": "name_en",
        "Level": "level",
        "Code": "czso_code",
    }
)

# Transform to tidy format: every remaining column becomes a (year, value) row
df_ppi_by_nace = df_ppi_by_nace.melt(
    id_vars=["czso_code", "name_cs", "name_en", "level"],
    var_name="year",
    value_name="value",
)

# Drop pre-2000 values; copy again because boolean filtering returns a
# slice and new columns are added right below.
df_ppi_by_nace = df_ppi_by_nace[df_ppi_by_nace["year"].astype(int) >= 2000].copy()

# Add metric: ppi_by_nace
df_ppi_by_nace["metric"] = "ppi_by_nace"
# Add unit: index
df_ppi_by_nace["unit"] = "2015=100"

# Replace i.d. and : by null in value col (: = missing value, i.d. = individual data)
df_ppi_by_nace["value"] = df_ppi_by_nace["value"].replace({"i.d." : None, ":": None})

# Data type conversion for all columns in a single call.
# NOTE(review): astype(str) turns missing names into the literal string
# "nan" - confirm that the name columns never contain missing values.
df_ppi_by_nace = df_ppi_by_nace.astype(
    {
        "value": float,
        "year": int,
        "czso_code": str,
        "level": int,
        "name_cs": str,
        "name_en": str,
        "metric": str,
        "unit": str,
    }
)

# Source
df_ppi_by_nace["source"] = "CZSO"




### Wages by NACE

source: CZSO
unit: average gross monthly wage by CZ-NACE activity, in CZK per full-time equivalent employee

- 2023 and 2024 preliminary data 

- the "Q1-Q4" columns are used as the annual value

- The data refer only to employees who have an employment contract with the reporting units. Persons performing public office (e.g. Members of Parliament, Senators, full-time councillors at all levels, judges) are excluded. The average wages refer to wages accounted for payment in the given period.

- the industry aggregate (code starting with "B+C+D+E") is assigned level 0; all other rows get level 1

In [None]:
# --- Average wages by NACE: load the CZSO sheet and reshape it into tidy format ---

# Define file paths (project_root is defined earlier in the notebook)
input_file = os.path.join(project_root, "data", "source_raw", "NACE", "pmzcr030625_2_wages by NACE.xlsx")

# Read the Excel file; header=4 takes the column names from the fifth row of the sheet
df_wages = pd.read_excel(input_file, sheet_name="List1", header=4)

# Remove the first two rows and the last three rows (notes). The explicit
# copy keeps later assignments from operating on a slice of the raw sheet
# (avoids SettingWithCopyWarning).
df_wages = df_wages.iloc[2:-3].copy()

# Rename columns:
df_wages = df_wages.rename(columns={"Unnamed: 0": "czso_code", "Unnamed: 1": "name"})

# Keep only columns czso_code, name and those that start with "Q1-Q4".
# str(col) guards against non-string column headers.
annual_cols = [col for col in df_wages.columns if str(col).startswith("Q1-Q4")]
year_labels = [str(year) for year in range(2000, 2025)]
if len(annual_cols) != len(year_labels):
    # Fail with a readable message when the sheet gains or loses a year
    raise ValueError(
        f"Expected {len(year_labels)} 'Q1-Q4' columns (2000-2024), found {len(annual_cols)}"
    )
df_wages = df_wages[["czso_code", "name"] + annual_cols].copy()
# Rename the columns from Q1-Q4, Q1-Q4.1, Q1-Q4.2, ... to 2000, 2001, 2002, ...
df_wages.columns = ["czso_code", "name"] + year_labels

# In czso_code replace the value that STARTS with "B+C+D+E" with "industry"
df_wages.loc[df_wages["czso_code"].str.startswith("B+C+D+E", na=False), "czso_code"] = "industry"
# For industry fill in name
df_wages.loc[df_wages["czso_code"] == "industry", "name"] = "Průmysl\nIndustry"

# Separate name into name_cs and name_en (separated by \n). n=1 splits on
# the first newline only, so a name with extra line breaks cannot produce
# more than two columns.
df_wages[["name_cs", "name_en"]] = df_wages["name"].str.split("\n", n=1, expand=True)
# Remove name column
df_wages = df_wages.drop(columns=["name"])

# Add level column: industry = 0, every other row = 1
df_wages["level"] = (df_wages["czso_code"] != "industry").astype(int)

# Transform to tidy format: one row per (code, year)
df_wages = df_wages.melt(
    id_vars=["czso_code", "name_cs", "name_en", "level"],
    var_name="year",
    value_name="value",
)

# Unit avg_gross_monthly_per_fulltime
df_wages["unit"] = "CZK_avg_gross_monthly_per_fulltime"
# Metric wages_by_nace
df_wages["metric"] = "avg_wages_by_nace"

# Data type conversion for all columns in a single call.
# NOTE(review): astype(str) turns missing codes/names into the literal
# strings "nan"/"None" (e.g. name_en for a name without a line break) -
# confirm this is acceptable downstream.
df_wages = df_wages.astype(
    {
        "value": float,
        "year": int,
        "czso_code": str,
        "level": int,
        "name_cs": str,
        "name_en": str,
        "metric": str,
        "unit": str,
    }
)

# Source
df_wages["source"] = "CZSO"




### avg number of employees by NACE

source: CZSO
- thousands of employees or full-time equivalents
- 2023 and 2024 preliminary data

- processed the same way as the wages dataset


In [None]:
# --- Average number of employees by NACE: load the CZSO sheet and reshape it into tidy format ---

# Define file paths (project_root is defined earlier in the notebook)
input_file = os.path.join(project_root, "data", "source_raw", "NACE", "pmzcr030625_3_csu avg number of employees by nace.xlsx")

# Read the Excel file; header=4 takes the column names from the fifth row of the sheet
df_employees = pd.read_excel(input_file, sheet_name="List1", header=4)

# Remove the first two rows and the last three rows (notes). The explicit
# copy keeps later assignments from operating on a slice of the raw sheet
# (avoids SettingWithCopyWarning).
df_employees = df_employees.iloc[2:-3].copy()

# Rename columns:
df_employees = df_employees.rename(columns={"Unnamed: 0": "czso_code", "Unnamed: 1": "name"})

# Keep only columns czso_code, name and those that start with "Q1-Q4".
# str(col) guards against non-string column headers.
annual_cols = [col for col in df_employees.columns if str(col).startswith("Q1-Q4")]
year_labels = [str(year) for year in range(2000, 2025)]
if len(annual_cols) != len(year_labels):
    # Fail with a readable message when the sheet gains or loses a year
    raise ValueError(
        f"Expected {len(year_labels)} 'Q1-Q4' columns (2000-2024), found {len(annual_cols)}"
    )
df_employees = df_employees[["czso_code", "name"] + annual_cols].copy()
# Rename the columns from Q1-Q4, Q1-Q4.1, Q1-Q4.2, ... to 2000, 2001, 2002, ...
df_employees.columns = ["czso_code", "name"] + year_labels

# In czso_code replace the value that STARTS with "B+C+D+E" with "industry"
df_employees.loc[df_employees["czso_code"].str.startswith("B+C+D+E", na=False), "czso_code"] = "industry"
# For industry fill in name
df_employees.loc[df_employees["czso_code"] == "industry", "name"] = "Průmysl\nIndustry"

# Separate name into name_cs and name_en (separated by \n). n=1 splits on
# the first newline only, so a name with extra line breaks cannot produce
# more than two columns.
df_employees[["name_cs", "name_en"]] = df_employees["name"].str.split("\n", n=1, expand=True)
# Remove name column
df_employees = df_employees.drop(columns=["name"])

# Add level column: industry = 0, every other row = 1
df_employees["level"] = (df_employees["czso_code"] != "industry").astype(int)

# Transform to tidy format: one row per (code, year)
df_employees = df_employees.melt(
    id_vars=["czso_code", "name_cs", "name_en", "level"],
    var_name="year",
    value_name="value",
)

# Unit
df_employees["unit"] = "ths"
# Metric
df_employees["metric"] = "no_of_employees_by_nace"

# Data type conversion for all columns in a single call.
# NOTE(review): astype(str) turns missing codes/names into the literal
# strings "nan"/"None" (e.g. name_en for a name without a line break) -
# confirm this is acceptable downstream.
df_employees = df_employees.astype(
    {
        "value": float,
        "year": int,
        "czso_code": str,
        "level": int,
        "name_cs": str,
        "name_en": str,
        "metric": str,
        "unit": str,
    }
)

# Source
df_employees["source"] = "CZSO"

In [None]:
# Stack the three tidy datasets (PPI, wages, employees) into one frame
# with a fresh, continuous row index
tidy_frames = [df_ppi_by_nace, df_wages, df_employees]
df_combined = pd.concat(tidy_frames, ignore_index=True)

# Arrange the columns in the final output order
column_order = [
    "czso_code",
    "level",
    "name_cs",
    "name_en",
    "year",
    "metric",
    "value",
    "unit",
    "source",
]
df_combined = df_combined[column_order]


In [None]:
# Save the combined tidy dataset to parquet
output_folder = os.path.join(project_root, "data", "source_cleaned")
# exist_ok=True creates the folder when it is missing and is a no-op when it
# already exists, so no separate existence check is needed
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, "data_by_nace_annual_tidy.parquet")

# index=False: the row index is not written to the file
df_combined.to_parquet(output_file, index=False, engine="pyarrow")
print(f"Data saved to {output_file}")
