### Data by NACE curation


In [1]:
import os
import pandas as pd
from datetime import datetime

### PPI by NACE (industrial)

source: CZSO

- yearly, base 2015
- contains industry (B,C,D,E), and levels 1, 2, 3

In [2]:
# Resolve the project root as the parent of the notebook's working directory.
# Later cells reuse `project_root` to locate their own input/output files.
script_dir = os.getcwd()  # current directory in Jupyter
project_root = os.path.abspath(os.path.join(script_dir, ".."))
input_file = os.path.join(project_root, "data", "source_raw", "NACE", "ipccr031725_21_CSU_PPI_by_NACE_industry.xlsx")

# Read the yearly sheet; header=1 takes the column names from the second sheet row.
ppi_ind_raw = pd.read_excel(input_file, sheet_name="IR15 roční (yearly)", header=1)

# Every step below returns a new frame instead of mutating a slice in place.
# The previous version called rename(inplace=True) and assigned columns on the
# results of .iloc[:-2] and of a boolean filter, i.e. chained assignment on
# slices, which pandas flags with SettingWithCopyWarning.
# NOTE: the aggregate code "B,C,D,E" is kept as published; an earlier
# replacement with "industry" was disabled in this cell.
ppi_ind_wide = (
    ppi_ind_raw
    .iloc[:-2]  # the last two rows are notes
    .rename(columns={
        "Unnamed: 2": "name_cs",
        "Unnamed: 3": "name_en",
        "Level": "level",
        "Code": "czso_code",  # same column name as in the other datasets
    })
    .assign(level=lambda frame: frame["level"] - 1)  # deduct 1 from the published level
)

# Transform to tidy format: one row per (series, year).
ppi_ind_long = ppi_ind_wide.melt(
    id_vars=["czso_code", "name_cs", "name_en", "level"],
    var_name="year",
    value_name="value",
)

# Drop pre-2000 values.
ppi_ind_long = ppi_ind_long[ppi_ind_long["year"].astype(int) >= 2000]

df_ppi_by_nace_ind = (
    ppi_ind_long
    .assign(
        metric="ppi_by_nace_industry",
        unit="2015=100",
        # ":" = missing value, "i.d." = individual data -> null before the float cast
        value=lambda frame: frame["value"].replace({"i.d.": None, ":": None}),
    )
    # Data type conversion (fails loudly if any other non-numeric marker remains in `value`).
    .astype({
        "value": float,
        "year": int,
        "czso_code": str,
        "level": int,
        "name_cs": str,
        "name_en": str,
        "metric": str,
        "unit": str,
    })
    .assign(source="CZSO - industry")
)




### SPPI by NACE aggregated

source: CZSO https://csu.gov.cz/produkty/ipc_ts > Table 1.1

- yearly, base: 
   - 2020 for agriculture
   - 2015 for other industries
- contains SPPI (aggregated) for NACE sections

In [3]:
input_file = os.path.join(project_root, "data", "source_raw", "NACE", "ipccr052025_11_CSU_PPI_an_SPPI_by_NACE_aggregated.xlsx")

# Load the sheet and keep positional rows 5..41 only: two identifier rows
# (removed further below) followed by the yearly figures we are interested in.
df_ppi_by_nace_agg = pd.read_excel(input_file, sheet_name="List1", header=1).iloc[5:42]

# Series published in the SPPI sheet, listed in the same order as the sheet
# columns: (name as printed in the source, level, czso_code).
sppi_series = [
    ("Agricultural producer incl. fish", 0, "A"),
    ("Agricultural producer - Animals", 1, "014+015+017+031"),
    ("Agricultural producer - Crops", 1, "011+012+013"),
    ("Industrial producer", 0, "B+C+D+E"),
    ("Construction work", 0, "F"),
    ("Market services price", 0, "G+H+I+J+K+L+M+N"),
    ("Market services price excl. advertising services", 0, "G+H+I+J+K+L+M+N-731"),
    ("Land transport services and transport services via pipelines", 1, "49"),
    ("Water transport services", 1, "50"),
    ("Warehousing and support services for transportation", 1, "52"),
    ("Postal and courier services", 1, "53"),
    ("Publishing services", 1, "58"),
    ("Motion picture, video and television programme production, sound recording and music publishing", 1, "59"),
    ("Programming and broadcasting services", 1, "60"),
    ("Telecommunications services", 1, "61"),
    ("Computer programming, consultancy", 1, "62"),
    ("Information services", 1, "63"),
    ("Insurance", 1, "65"),
    ("Real estate services", 1, "68"),
    ("Legal and accounting services", 1, "69"),
    ("Services of head offices; management consulting services", 1, "70"),
    ("Architectural and engineering services", 1, "71"),
    ("Advertising and market research services", 1, "73"),
    ("Other professional, scientific, technic. services", 1, "74"),
    ("Rental and leasing services", 1, "77"),
    ("Employment services", 1, "78"),
    ("Security and investigation services", 1, "80"),
    ("Services to buildings and landscape", 1, "81"),
    ("Office administrative and other support services", 1, "82"),
]
sppi_series_table = pd.DataFrame(sppi_series, columns=["name_source_row", "level", "czso_code"])

# Official Czech / English names come from the cleaned NACE matching table.
nace_matching_file = os.path.join(project_root, "data", "source_cleaned", "t_nace_matching.parquet")
df_nace_matching = pd.read_parquet(nace_matching_file)

# Left join so that every SPPI series is kept even without a match.
df_nace_sppi_cols = sppi_series_table.merge(
    df_nace_matching[["czso_code", "name_czso_cs", "name_czso_en"]],
    on="czso_code",
    how="left",
    suffixes=("", "_y"),
)

# Series without an official name fall back to the name printed in the source sheet.
for name_col in ("name_czso_cs", "name_czso_en"):
    df_nace_sppi_cols[name_col] = df_nace_sppi_cols[name_col].fillna(df_nace_sppi_cols["name_source_row"])

# Build one label row for the sheet: "year" for the first column, followed by
# the czso_code of every series in df_nace_sppi_cols order. Only codes are
# added here; names and level are merged in after the melt below.
# The DataFrame constructor raises if the number of labels differs from the
# number of sheet columns.
# NOTE(review): assumes the sheet columns appear in exactly the order of
# df_nace_sppi_cols - confirm against the workbook when the source file changes.
czso_code_row = ["year"] + df_nace_sppi_cols['czso_code'].tolist()
df_czso_code_row = pd.DataFrame([czso_code_row], columns=df_ppi_by_nace_agg.columns)

# Put the label row on top; ignore_index renumbers the rows, so the label row
# becomes row 0 and the sliced sheet rows follow as 1, 2, ...
df_ppi_by_nace_agg = pd.concat([df_czso_code_row, df_ppi_by_nace_agg], ignore_index=True)
# drop the original identifier rows (the first two rows of the sliced sheet,
# now at positions 1 and 2); the label row above replaces them
df_ppi_by_nace_agg.drop(index=[1, 2], inplace=True)

# promote the label row to column names, then remove it from the data
df_ppi_by_nace_agg.columns = df_ppi_by_nace_agg.iloc[0]
df_ppi_by_nace_agg = df_ppi_by_nace_agg[1:]

# transform to tidy format: one row per (year, czso_code)
df_ppi_by_nace_agg = df_ppi_by_nace_agg.melt(id_vars=["year"], var_name="czso_code", value_name="value")

# enrich with names and level from df_nace_sppi_cols; the left join keeps every observation
df_ppi_by_nace_agg = pd.merge(
    df_ppi_by_nace_agg,
    df_nace_sppi_cols[["czso_code", "name_czso_cs", "name_czso_en", "level"]],
    on="czso_code",
    how="left"
)

# rename the name columns to the names used by the other tidy tables in this notebook
df_ppi_by_nace_agg.rename(columns={
    "name_czso_cs": "name_cs",
    "name_czso_en": "name_en"
}, inplace=True)

# Tag every observation with its metric and source, and parse the year
# (anything that is not a number becomes NaN).
ppi_agg = df_ppi_by_nace_agg.assign(
    metric="ppi_by_nace_aggregated",
    source="CZSO - without industry",
    year=lambda frame: pd.to_numeric(frame["year"], errors='coerce'),
)

# Rows whose year could not be parsed are discarded; year and value must be
# numeric before the rebasing step.
ppi_agg = ppi_agg.dropna(subset=["year"]).assign(
    year=lambda frame: frame["year"].astype(int),
    value=lambda frame: pd.to_numeric(frame["value"], errors='coerce'),
)

# The agricultural series are published with base 2020=100; re-express them
# with base 2015=100 like every other series.
agri_codes_to_rebase = ["A", "014+015+017+031", "011+012+013"]
is_year_2015 = ppi_agg['year'] == 2015

for code in agri_codes_to_rebase:
    rows_of_code = ppi_agg['czso_code'] == code

    # 2015 level of this series, still expressed on the 2020 base
    anchor = ppi_agg.loc[rows_of_code & is_year_2015, 'value']
    anchor_is_usable = (not anchor.empty) and pd.notna(anchor.iloc[0]) and anchor.iloc[0] != 0

    if not anchor_is_usable:
        # Without a usable 2015 value the series cannot be rebased: warn and blank it out
        print(f"Warning: Could not rebase NACE code {code} due to missing or zero 2015 value.")
        ppi_agg.loc[rows_of_code, 'value'] = pd.NA
        continue

    # new value = old value / (old value in 2015) * 100
    ppi_agg.loc[rows_of_code, 'value'] = (ppi_agg.loc[rows_of_code, 'value'] / anchor.iloc[0]) * 100

# Round to one decimal place; after rebasing every series is on base 2015=100.
ppi_agg = ppi_agg.assign(
    value=lambda frame: frame["value"].round(1),
    unit="2015=100",
)

# Final data types
ppi_agg = ppi_agg.astype({
    "czso_code": str,
    "name_cs": str,
    "name_en": str,
    "level": int,
    "year": int,
    "value": float,
    "metric": str,
    "unit": str,
    "source": str,
})

# Column order shared with the other tidy tables
col_order = ['czso_code', 'level', 'name_cs', 'name_en', 'year', 'metric', 'value', 'unit', 'source']

# Keep observations from 2000 onwards and leave out two series:
# - "B+C+D+E" (Industrial producer)
# - "G+H+I+J+K+L+M+N-731": adds unnecessary complexity compared to the market services data
codes_left_out = ["B+C+D+E", "G+H+I+J+K+L+M+N-731"]
rows_to_keep = (ppi_agg["year"] >= 2000) & ~ppi_agg["czso_code"].isin(codes_left_out)
df_ppi_by_nace_agg = ppi_agg.loc[rows_to_keep, col_order]


### Merged PPI industry and PPI aggregated
source: CZSO
- yearly, base 2015
 

In [4]:
# Sanity check before combining: the industrial and the aggregated PPI tables
# must not both contain a value for the same (czso_code, year) pair.

print("--- Overlap Check: df_ppi_by_nace_ind vs df_ppi_by_nace_agg ---")

# Inner join on the key columns; the suffixes tell apart the value/metric
# columns coming from each table.
overlap_keys = ['czso_code', 'year']
compared_cols = overlap_keys + ['value', 'metric']
overlap_df = df_ppi_by_nace_ind[compared_cols].merge(
    df_ppi_by_nace_agg[compared_cols],
    on=overlap_keys,
    suffixes=('_ind', '_agg'),
)

if overlap_df.empty:
    print("No overlap found between df_ppi_by_nace_ind and df_ppi_by_nace_agg based on 'czso_code' and 'year'.")
else:
    print(f"Found {len(overlap_df)} overlapping records between industrial PPI and aggregated PPI.")
    print("Overlapping records (first 5):")
    print(overlap_df.head())

    print("\nUnique overlapping czso_codes:")
    print(overlap_df['czso_code'].unique())


--- Overlap Check: df_ppi_by_nace_ind vs df_ppi_by_nace_agg ---
No overlap found between df_ppi_by_nace_ind and df_ppi_by_nace_agg based on 'czso_code' and 'year'.


In [5]:
# Unified PPI table: both PPI tables stacked under one common metric name.
# assign() works on copies, so the two source tables keep their own metric.
unified_metric = "ppi_by_nace"
df_ppi_ind_for_combine = df_ppi_by_nace_ind.assign(metric=unified_metric)
df_ppi_agg_for_combine = df_ppi_by_nace_agg.assign(metric=unified_metric)

# Stack the two tables with a fresh row index
df_ppi_unified = pd.concat(
    [df_ppi_ind_for_combine, df_ppi_agg_for_combine],
    ignore_index=True,
)

# Summary of what went into the unified table
print("Created df_ppi_unified by combining df_ppi_by_nace_ind and df_ppi_by_nace_agg.")
print(f"Shape of df_ppi_unified: {df_ppi_unified.shape}")
print(f"Unique metrics in df_ppi_unified: {df_ppi_unified['metric'].unique()}")
print(f"Number of rows from industry PPI: {len(df_ppi_ind_for_combine)}")
print(f"Number of rows from aggregated PPI: {len(df_ppi_agg_for_combine)}")
print(f"Total rows in unified PPI: {len(df_ppi_unified)}")


Created df_ppi_unified by combining df_ppi_by_nace_ind and df_ppi_by_nace_agg.
Shape of df_ppi_unified: (3350, 9)
Unique metrics in df_ppi_unified: ['ppi_by_nace']
Number of rows from industry PPI: 2675
Number of rows from aggregated PPI: 675
Total rows in unified PPI: 3350


### Wages by NACE

source: CZSO

unit: average gross monthly wage by CZ-NACE activity, in CZK per full-time equivalent employee

- 2023 and 2024 preliminary data 

- the "Q1-Q4" columns are used as the annual figures

-  The data refer only to the employees with an employment contract with the reporting units. Excludes persons performing public office, such as Members of Parliament, Senators, full-time councillors at all levels, judges, etc. The average wages refer to wages accounted for payment in the given period. 

- the industry aggregate (B+C+D+E) is assigned level 0; all other rows are assigned level 1

In [6]:
# Locate the CZSO workbook with average wages by CZ-NACE activity
input_file = os.path.join(project_root, "data", "source_raw", "NACE", "pmzcr030625_2_wages by NACE.xlsx")

# Read the sheet; header=4 takes the column names from the fifth sheet row
wages_raw = pd.read_excel(input_file, sheet_name="List1", header=4)

# Drop the first two rows and the last three rows (notes), then name the two
# identifier columns. rename() returns a new frame; the previous version
# renamed a slice in place, which pandas flags with SettingWithCopyWarning.
wages_wide = (
    wages_raw
    .iloc[2:-3]
    .rename(columns={"Unnamed: 0": "czso_code", "Unnamed: 1": "name"})
)

# Keep the identifiers plus the annual columns, whose headers start with
# "Q1-Q4" (Q1-Q4, Q1-Q4.1, Q1-Q4.2, ...). The explicit .copy() makes the
# column subset an independent frame, so the assignments below are safe.
annual_cols = [col for col in wages_wide.columns if str(col).startswith("Q1-Q4")]
wages_wide = wages_wide[["czso_code", "name"] + annual_cols].copy()

# Relabel the annual columns as 2000 ... 2024. Assigning the complete list
# (rather than a rename mapping) makes pandas raise if the number of annual
# columns in the workbook ever differs from the expected 25.
wages_wide.columns = ["czso_code", "name"] + [str(year) for year in range(2000, 2025)]

# The row whose code STARTS with "B+C+D+E" is the industry aggregate:
# give it a short code and a bilingual name in the same "cs\nen" layout
wages_wide.loc[wages_wide["czso_code"].str.startswith("B+C+D+E", na=False), "czso_code"] = "industry"
wages_wide.loc[wages_wide["czso_code"] == "industry", "name"] = "Průmysl\nIndustry"

# Separate the Czech and English names (divided by a line break). n=1 splits
# on the first line break only, so a label containing further line breaks
# still yields exactly two parts instead of failing on a column-count mismatch.
wages_wide[["name_cs", "name_en"]] = wages_wide["name"].str.split("\n", n=1, expand=True)
wages_wide = wages_wide.drop(columns=["name"])

# level: 0 for the industry aggregate, 1 for every other row
wages_wide["level"] = wages_wide["czso_code"].ne("industry").astype(int)

df_wages = (
    wages_wide
    # transform to tidy format: one row per (activity, year)
    .melt(id_vars=["czso_code", "name_cs", "name_en", "level"], var_name="year", value_name="value")
    .assign(
        unit="CZK_avg_gross_monthly_per_fulltime",
        metric="avg_wages_by_nace",
    )
    # data type conversion
    .astype({
        "value": float,
        "year": int,
        "czso_code": str,
        "level": int,
        "name_cs": str,
        "name_en": str,
        "metric": str,
        "unit": str,
    })
    .assign(source="CZSO")
)




### avg number of employees by NACE

source: CZSO
- thousands of employees or full-time equivalents
- 2023 and 2024 preliminary data

- processed the same way as the wages table above


In [7]:
# Locate the CZSO workbook with the average number of employees by CZ-NACE activity
input_file = os.path.join(project_root, "data", "source_raw", "NACE", "pmzcr030625_3_csu avg number of employees by nace.xlsx")

# Read the sheet; header=4 takes the column names from the fifth sheet row
employees_raw = pd.read_excel(input_file, sheet_name="List1", header=4)

# Drop the first two rows and the last three rows (notes), then name the two
# identifier columns. rename() returns a new frame; the previous version
# renamed a slice in place, which pandas flags with SettingWithCopyWarning.
employees_wide = (
    employees_raw
    .iloc[2:-3]
    .rename(columns={"Unnamed: 0": "czso_code", "Unnamed: 1": "name"})
)

# Keep the identifiers plus the annual columns, whose headers start with
# "Q1-Q4" (Q1-Q4, Q1-Q4.1, Q1-Q4.2, ...). The explicit .copy() makes the
# column subset an independent frame, so the assignments below are safe.
annual_cols = [col for col in employees_wide.columns if str(col).startswith("Q1-Q4")]
employees_wide = employees_wide[["czso_code", "name"] + annual_cols].copy()

# Relabel the annual columns as 2000 ... 2024. Assigning the complete list
# (rather than a rename mapping) makes pandas raise if the number of annual
# columns in the workbook ever differs from the expected 25.
employees_wide.columns = ["czso_code", "name"] + [str(year) for year in range(2000, 2025)]

# The row whose code STARTS with "B+C+D+E" is the industry aggregate:
# give it a short code and a bilingual name in the same "cs\nen" layout
employees_wide.loc[employees_wide["czso_code"].str.startswith("B+C+D+E", na=False), "czso_code"] = "industry"
employees_wide.loc[employees_wide["czso_code"] == "industry", "name"] = "Průmysl\nIndustry"

# Separate the Czech and English names (divided by a line break). n=1 splits
# on the first line break only, so a label containing further line breaks
# still yields exactly two parts instead of failing on a column-count mismatch.
employees_wide[["name_cs", "name_en"]] = employees_wide["name"].str.split("\n", n=1, expand=True)
employees_wide = employees_wide.drop(columns=["name"])

# level: 0 for the industry aggregate, 1 for every other row
employees_wide["level"] = employees_wide["czso_code"].ne("industry").astype(int)

df_employees = (
    employees_wide
    # transform to tidy format: one row per (activity, year)
    .melt(id_vars=["czso_code", "name_cs", "name_en", "level"], var_name="year", value_name="value")
    .assign(
        unit="ths",
        metric="no_of_employees_by_nace",
    )
    # data type conversion
    .astype({
        "value": float,
        "year": int,
        "czso_code": str,
        "level": int,
        "name_cs": str,
        "name_en": str,
        "metric": str,
        "unit": str,
    })
    .assign(source="CZSO")
)

In [8]:
# Stack all tidy tables into one frame (writing to parquet happens in the next cell).
# df_ppi_unified repeats the rows of both PPI tables under the common metric "ppi_by_nace".
combined_columns = ["czso_code", "level", "name_cs", "name_en", "year", "metric", "value", "unit", "source"]
df_combined = pd.concat(
    [df_ppi_by_nace_ind, df_wages, df_employees, df_ppi_by_nace_agg, df_ppi_unified],
    ignore_index=True,
).loc[:, combined_columns]  # fixed column order


In [9]:
# Save the combined tidy table to parquet
output_folder = os.path.join(project_root, "data", "source_cleaned")
# exist_ok=True creates the folder when missing and is a no-op otherwise; it
# replaces the separate exists-check, which could fail if the folder appeared
# between the check and the creation.
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, "data_by_nace_annual_tidy.parquet")

# index=False: the row index is not written to the file
df_combined.to_parquet(output_file, index=False, engine="pyarrow")
print(f"Data saved to {output_file}")


Data saved to /Users/adam/Library/Mobile Documents/com~apple~CloudDocs/School/Master's Thesis/Analysis/profit-margins-inflation/data/source_cleaned/data_by_nace_annual_tidy.parquet
