In [10]:
import os
import pandas as pd
from datetime import timedelta

### Time-Weighted Annual CNB Repo Rates

**Data Source**  
A text file listing CNB repo rate decisions. Each decision is valid until the next one begins.

**Method**  
- **Time Weighting**: Instead of taking a simple average of all rates in a given year, we break each `[start_date, next_start)` interval by calendar year boundaries and compute a time‐weighted average. This ensures that a rate valid for a longer period has a proportionally bigger impact on the annual average.

**Justification**  
- **Accuracy**: Merely averaging rate values ignores how long each rate was in effect. Time weighting aligns the annual rate with real monetary policy conditions experienced throughout the year.
- **Usability**: The resulting annual series can be easily merged with other macro data (HICP, wages, etc.) for subsequent econometric analysis.


In [11]:
# Paths (resolved relative to the notebook's working directory)
script_dir = os.getcwd()  # Jupyter Notebook working directory
project_root = os.path.abspath(os.path.join(script_dir, ".."))
input_file = os.path.join(project_root, "data", "source_raw", "economy", "CNB_repo_sazby.txt")
output_folder = os.path.join(project_root, "data", "source_cleaned")
# exist_ok=True replaces the separate "exists?" check and is safe to re-run.
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, "cnb_repo_annual.parquet")

# Read data: header-less, pipe-separated lines of "<YYYYMMDD>|<rate>".
df_repo = pd.read_csv(
    input_file,
    sep="|",
    names=["VALID_FROM", "CNB_REPO_RATE_IN_PCT"],
    header=None,
    dtype={"VALID_FROM": str}
)
df_repo["VALID_FROM"] = pd.to_datetime(df_repo["VALID_FROM"], format="%Y%m%d", errors="coerce")
df_repo["CNB_REPO_RATE_IN_PCT"] = pd.to_numeric(df_repo["CNB_REPO_RATE_IN_PCT"], errors="coerce")

# Rows whose date could not be parsed cannot be placed on the timeline; drop them
# and order the remaining decisions chronologically.
df_repo = df_repo.dropna(subset=["VALID_FROM"]).sort_values("VALID_FROM")

# Next start date (rates are valid until the day before the next decision),
# i.e. NEXT_START is an EXCLUSIVE upper bound of each rate's validity.
df_repo["NEXT_START"] = df_repo["VALID_FROM"].shift(-1)

# Define cutoff for the last row: the most recent rate has no successor, so it is
# assumed to stay in force until the end of the following calendar year.
if not df_repo.empty:
    last_idx = df_repo.index[-1]
    last_year = df_repo.loc[last_idx, "VALID_FROM"].year
    # Because NEXT_START is exclusive, the bound must be 1 January of the year
    # after next; using 31 December would silently drop that final day.
    cutoff_date = pd.Timestamp(year=last_year + 2, month=1, day=1)
    df_repo.loc[last_idx, "NEXT_START"] = cutoff_date
def split_interval_by_year(start_date, end_date):
    """Cut the half-open range [start_date, end_date) at calendar-year borders.

    Parameters
    ----------
    start_date : pandas.Timestamp
        First day covered (inclusive).
    end_date : pandas.Timestamp
        First day NOT covered (exclusive).

    Returns
    -------
    list of dict
        One dict per calendar year touched, in chronological order, with keys
        ``year``, ``days_in_interval`` (days of the range that fall in that
        year) and ``total_days_for_rate`` (length of the whole range in days).
        An empty list is returned when the range contains no day at all.
    """
    one_day = pd.Timedelta(days=1)
    last_day = end_date - one_day  # switch to an inclusive end
    if last_day < start_date:
        return []

    span_days = (last_day - start_date).days + 1
    pieces = []
    cursor = start_date
    while cursor <= last_day:
        dec_31 = pd.to_datetime(f"{cursor.year}-12-31")
        # The piece stops at year end or at the end of the range, whichever is first.
        piece_end = dec_31 if dec_31 <= last_day else last_day
        pieces.append({
            "year": cursor.year,
            "days_in_interval": (piece_end - cursor).days + 1,
            "total_days_for_rate": span_days,
        })
        cursor = piece_end + one_day
    return pieces

# Expand every rate decision into one record per calendar year it touches.
# Each record carries the number of days the rate was in force within that year;
# these day counts are the weights of the annual average.
records = []
for _, row in df_repo.iterrows():
    rate = row["CNB_REPO_RATE_IN_PCT"]
    for iv in split_interval_by_year(row["VALID_FROM"], row["NEXT_START"]):
        days = iv["days_in_interval"]
        records.append({
            "year": iv["year"],
            "repo_rate": rate,
            "days_in_year": days,
            "weighted_rate": rate * days,  # rate x days in force during this year
        })

df_intervals = pd.DataFrame(records)
if not df_intervals.empty:
    # Days whose rate could not be parsed (NaN) must not count in the denominator,
    # otherwise they would pull the average towards zero.
    df_intervals["days_in_year"] = df_intervals["days_in_year"].where(
        df_intervals["repo_rate"].notna()
    )

    annual_data = df_intervals.groupby("year", as_index=False).agg(
        sum_weighted=("weighted_rate", "sum"),
        sum_days=("days_in_year", "sum"),
    )

    # Time-weighted annual rate = sum(rate_i * days_i) / sum(days_i).
    # Dividing by the days actually covered (rather than a fixed 365/366) keeps
    # partially covered years correct, and a rate of exactly 0 is handled like
    # any other rate (no division by the rate itself).
    annual_data["cnb_repo_rate_annual"] = annual_data["sum_weighted"] / annual_data["sum_days"]

    final_annual = annual_data[["year", "cnb_repo_rate_annual"]].sort_values("year")
    # drop years pre 2000 and post 2024
    final_annual = final_annual[final_annual["year"].between(2000, 2024)]
    # Save to parquet
    #final_annual.to_parquet(output_file, engine="pyarrow", compression="snappy", index=False)
    #print(f"Time-weighted annual CNB repo rates saved to: {output_file}")
    #display(final_annual.head(25))
else:
    print("No records found after year 2000.")
    # Keep the name defined (with the expected schema) so later cells that merge
    # on "year" do not fail with a NameError.
    final_annual = pd.DataFrame({
        "year": pd.Series(dtype="int64"),
        "cnb_repo_rate_annual": pd.Series(dtype="float64"),
    })


### Annual HICP Data Preparation

**Data Source:**  
Monthly HICP data (Overall index) with columns:
- `DATE` (e.g., "1996-12-31")
- `TIME PERIOD`
- `"HICP - Overall index (ICP.M.CZ.N.000000.4.ANR)"`

**Method:**  
- **Parse Dates & Numeric Conversion:**  
  Convert the `DATE` column to datetime and the HICP index column to numeric.
- **Filter December Values:**  
  Select only records where the month is December. December data represents the end-of-year value, which is often used as the annual measure.
- **Extract Year:**  
  Create a `year` column from the December dates.
- **Output:**  
  The resulting DataFrame contains one observation per year with columns `year` and `hicp_dec`. This annual series will serve as the base for merging with other macroeconomic data.

**Justification:**  
Using December values provides a consistent, end-of-year snapshot. This harmonized annual format simplifies integration with datasets such as the CNB repo rates and firm-level financial data.


In [12]:
# Resolve input/output locations relative to the notebook's working directory
script_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(script_dir, ".."))
output_folder = os.path.join(project_root, "data", "source_cleaned")
output_file = os.path.join(output_folder, "hicp_december_annual.parquet")
input_file = os.path.join(project_root, "data", "source_raw", "economy", "ECB Data Portal_20250402011223_HICP_from1996_CZ.csv")
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Load the monthly series and coerce its two relevant columns to proper dtypes;
# values that cannot be parsed become NaT / NaN.
hicp_value_col = "HICP - Overall index (ICP.M.CZ.N.000000.4.ANR)"
df_hicp = pd.read_csv(input_file)
df_hicp["DATE"] = pd.to_datetime(df_hicp["DATE"], format="%Y-%m-%d", errors="coerce")
df_hicp["hicp_dec"] = pd.to_numeric(df_hicp[hicp_value_col], errors="coerce")

# One observation per year: the December reading, tagged with its calendar year
is_december = df_hicp["DATE"].dt.month.eq(12)
df_hicp_dec = df_hicp.loc[is_december].copy()
df_hicp_dec["year"] = df_hicp_dec["DATE"].dt.year

# Annual table (year, hicp_dec) restricted to 2000 onwards
df_hicp_annual = df_hicp_dec.loc[:, ["year", "hicp_dec"]].reset_index(drop=True)
df_hicp_annual = df_hicp_annual.loc[df_hicp_annual["year"].ge(2000)]

# Persist the result
df_hicp_annual.to_parquet(output_file, engine="pyarrow", compression="snappy", index=False)

print("Annual HICP (December values) saved to:", output_file)
#display(df_hicp_annual.head())


Annual HICP (December values) saved to: /Users/adam/Library/Mobile Documents/com~apple~CloudDocs/School/Master's Thesis/Analysis/profit-margins-inflation/data/source_cleaned/hicp_december_annual.parquet


### Annual Wages & Employees Data Preparation

**Data Source:**  
Excel file (`pmzcr030625_1_wages_avg.xlsx`) - CZSO reporting average gross monthly wages and average employee counts (in thousands) per full-time equivalent.

**Process:**
- **Select & Rename:**  
  Extract the year, nominal wage (CZK), and number of employees columns; rename them to `year`, `nom_gr_avg_wage_czk`, and `no_of_employees_ths`.
- **Filter Rows:**  
  Retain only the 25 annual rows (row positions 1–25 of the sheet as read; the first row after the header is skipped).
- **Clean Data:**  
  Correct irregular year entries (e.g., "20233)" → "2023"), convert `year` to integer, and ensure wage and employee columns are numeric.
  - values for 2023 and 2024 are preliminary
- **Output:**  
  Save the cleaned annual data as a Parquet.

**Rationale:**  
Focusing on annual data and standardizing types streamlines merging with other macro series (e.g., CNB repo rates, HICP) and minimizes processing errors.

---

In [13]:
# Define file paths
script_dir = os.getcwd()  # Current directory in Jupyter
project_root = os.path.abspath(os.path.join(script_dir, ".."))
input_file = os.path.join(project_root, "data", "source_raw", "economy", "pmzcr030625_1_wages_avg.xlsx")
output_folder = os.path.join(project_root, "data", "source_cleaned")
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, "wages_no_employees_annual.parquet")

# Read the Excel file
df_wages = pd.read_excel(input_file, sheet_name="List1", skiprows=4)

# Select only the relevant columns by position and rename them:
# 0 = year, 1 = nominal wage (CZK), 4 = number of employees (thousands)
df_wages = df_wages.iloc[:, [0, 1, 4]].rename(columns={
    df_wages.columns[0]: "year",
    df_wages.columns[1]: "nom_gr_avg_wage_czk",
    df_wages.columns[4]: "no_of_employees_ths"
})

# Keep the 25 yearly rows at positions 1..25 (position 0 is skipped)
df_wages = df_wages.iloc[1:26].copy()

# Normalise the year: some cells carry a footnote marker glued to the year
# (e.g. "20233)" for 2023). Taking the leading four digits handles any such
# marker, not only the two known ones, and also works for numeric cells.
year_text = df_wages["year"].astype(str).str.extract(r"^\s*(\d{4})", expand=False)
df_wages["year"] = pd.to_numeric(year_text, errors="coerce")
# "year" is the merge key downstream, so it must end up as a real integer column.
# Rows without a recognisable year cannot be merged and are dropped explicitly
# instead of silently leaving the whole column as object dtype.
df_wages = df_wages.dropna(subset=["year"]).astype({"year": int})

# Convert the nominal wage column to numeric
df_wages["nom_gr_avg_wage_czk"] = pd.to_numeric(df_wages["nom_gr_avg_wage_czk"], errors="coerce")
# Convert the number of employees column to numeric
df_wages["no_of_employees_ths"] = pd.to_numeric(df_wages["no_of_employees_ths"], errors="coerce")

# save to parquet
#df_wages.to_parquet(output_file, engine="pyarrow", compression="snappy", index=False)
#print("Annual wages data and no. of employees saved to:", output_file)


### GDP Data Preparation

**Data Source:** CZSO

Hruby domaci produkt - stale ceny z r 2020

Kód: NUCDUSHV01-R/11
Výpočet ukazatelů ve stálých cenách roku 2020 byl proveden metodou řetězení meziročních indexů (vypočtených vždy na bázi průměru předchozího roku). Tato metoda zajišťuje vazbu ukazatelů ve čtvrtletní a roční periodicitě, ale neumožňuje vazbu jednotlivých komponent na úhrn.
SOPR – stejné období předchozího roku.
odhad podle sumy kvart. hodnot

We use the **"Stálé ceny roku 2020"** (constant 2020 prices) column. This column represents GDP in constant prices (with 2020 as the base year), allowing you to compare real economic growth over time without the distortions caused by price changes.

**Why "Stálé ceny roku 2020" is preferable:**

- **Removes Inflation Effects:**  
  It measures the true volume of economic activity rather than nominal changes that include inflation.

- **Consistent Time Comparison:**  
  It enables a more accurate comparison of economic performance across different years, as all values are adjusted to the price level of the base year (2020).

- **Policy Analysis Relevance:**  
  For studies that examine relationships between variables like profit margins and inflation, it’s crucial to work with real output measures. This helps in isolating the effect of inflation from other factors.


In [14]:
# Define file paths
script_dir = os.getcwd()  # Current directory in Jupyter
project_root = os.path.abspath(os.path.join(script_dir, ".."))
input_file = os.path.join(project_root, "data", "source_raw", "economy", "NUCDUSHV01-R_CZSO_GDP.xlsx")
output_folder = os.path.join(project_root, "data", "source_cleaned")
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, "gdp_annual.parquet")

# Read the Excel file
df_gdp = pd.read_excel(input_file, sheet_name="DATA", skiprows=6)

# Select only the relevant columns by position and rename them:
# 1 = year, 2 = GDP nominal prices, 4 = GDP 2020 base prices,
# 5 = GDP 2020 base prices SOPR (same period of the previous year),
# 6 = deflator nominal, 7 = deflator base 2020
df_gdp = df_gdp.iloc[:, [1, 2, 4, 5, 6, 7]].rename(columns={
    df_gdp.columns[1]: "year",
    df_gdp.columns[2]: "gdp_nominal_prices",
    df_gdp.columns[4]: "gdp_2020_base_prices",
    df_gdp.columns[5]: "gdp_2020_base_prices_sopr",
    df_gdp.columns[6]: "deflator_nominal",
    df_gdp.columns[7]: "deflator_base_2020"
})

# Select only the first 25 rows (yearly values)
df_gdp = df_gdp.iloc[0:25].copy()

# Normalise the year: entries such as "2024 [3]" carry a footnote marker, so keep
# only the leading four digits. Casting to str first matters because `.str`
# returns NaN for cells that are not strings (e.g. years read as numbers).
year_text = df_gdp["year"].astype(str).str.extract(r"^\s*(\d{4})", expand=False)
df_gdp["year"] = pd.to_numeric(year_text, errors="coerce")
# "year" is the merge key downstream: drop rows without a recognisable year and
# guarantee an integer dtype rather than silently keeping an object column.
df_gdp = df_gdp.dropna(subset=["year"]).astype({"year": int})

# Convert the GDP columns to numeric (unparseable cells become NaN)
gdp_value_cols = [
    "gdp_nominal_prices",
    "gdp_2020_base_prices",
    "gdp_2020_base_prices_sopr",
    "deflator_nominal",
    "deflator_base_2020",
]
for col in gdp_value_cols:
    df_gdp[col] = pd.to_numeric(df_gdp[col], errors="coerce")

# Save to parquet

#df_gdp.to_parquet(output_file, engine="pyarrow", compression="snappy", index=False)
#print("Annual GDP data saved to:", output_file)
#display(df_gdp.head())




  warn("Workbook contains no default style, apply openpyxl's default")


### Unemployment rate Data Preparation

source: CZSO 
Obecná míra nezaměstnanosti - Ceska republika

chosen based on data availability from 2000



In [15]:
# Define file paths
script_dir = os.getcwd()  # Current directory in Jupyter
project_root = os.path.abspath(os.path.join(script_dir, ".."))
input_file = os.path.join(project_root, "data", "source_raw", "economy", "ZAMDPORK02_unemployment.xlsx")
output_folder = os.path.join(project_root, "data", "source_cleaned")
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, "unemp_rate_annual.parquet")

# Read the Excel file
df_unemp = pd.read_excel(input_file, sheet_name="DATA", skiprows=6)

# drop first column
df_unemp = df_unemp.drop(columns=df_unemp.columns[0])

# keep only the first data row; the sheet is in wide format (one column per
# period), so transpose it into one row per period
df_unemp = df_unemp.iloc[0:1].transpose()

# drop the first transposed row (the "Czech Republic" label, not an observation)
df_unemp = df_unemp.drop(index=df_unemp.index[0])

# the former column headers are now the index: turn them into a "year" column
# and name the single value column
df_unemp = df_unemp.reset_index().rename(columns={"index": "year"})
df_unemp = df_unemp.rename(columns={df_unemp.columns[1]: "unemp_rate"})

# Normalise the year taken from the headers: keep the leading four digits so that
# numeric headers, text headers and headers with a trailing marker all parse.
year_text = df_unemp["year"].astype(str).str.extract(r"^\s*(\d{4})", expand=False)
df_unemp["year"] = pd.to_numeric(year_text, errors="coerce")
# "year" is the merge key downstream: drop entries without a recognisable year
# and guarantee an integer dtype rather than silently keeping an object column.
df_unemp = df_unemp.dropna(subset=["year"]).astype({"year": int})

# Convert the unemployment rate column to numeric
df_unemp["unemp_rate"] = pd.to_numeric(df_unemp["unemp_rate"], errors="coerce")

# save to parquet
# df_unemp.to_parquet(output_file, engine="pyarrow", compression="snappy", index=False)
# print("Annual unemployment rate data saved to:", output_file)
#display(df_unemp.head())


  warn("Workbook contains no default style, apply openpyxl's default")


### FX rates 

source: CNB (https://www.cnb.cz/en/financial-markets/foreign-exchange-market/central-bank-exchange-rate-fixing/central-bank-exchange-rate-fixing/currency_average.html?currency=EUR)

- **Data Source:** CNB
- **Data Type:** Average quarterly FX rates (CZK/EUR)

- **Data Format:** txt

We calculate an annual FX rate by taking the simple average of the four quarterly values, since each quarter is weighted equally (3 months). This arithmetic average provides a reasonable approximation of the yearly exchange rate.



In [16]:
# Define file paths
script_dir = os.getcwd()  # Current directory in Jupyter
project_root = os.path.abspath(os.path.join(script_dir, ".."))
input_file = os.path.join(project_root, "data", "source_raw", "economy", "CNB FX rates from 1999.txt")
output_folder = os.path.join(project_root, "data", "source_cleaned")
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, "fx_rates_annual.parquet")

# Read the FX rates data (pipe-separated, first column is the year)
df_fx = pd.read_csv(
    input_file,
    sep="|",
)

# "year" is the merge key downstream: parse its leading four digits, drop rows
# without a recognisable year and guarantee an integer dtype (instead of a silent
# errors="ignore" cast that may leave an object column behind).
year_text = df_fx["year"].astype(str).str.extract(r"^\s*(\d{4})", expand=False)
df_fx["year"] = pd.to_numeric(year_text, errors="coerce")
df_fx = df_fx.dropna(subset=["year"]).astype({"year": int})

# Every column after the year holds a rate. Name them once so that the numeric
# conversion, the average and the final column selection all use the same set.
rate_cols = list(df_fx.columns[1:])

# Assign by column label: an `.iloc[:, 1:] = ...` assignment may write into the
# existing (object) columns and therefore not change their dtype.
df_fx[rate_cols] = df_fx[rate_cols].apply(pd.to_numeric, errors="coerce")

# Annual rate = simple mean of the period values in each row (NaN are skipped)
df_fx["fx_czk_eur_annual_avg"] = df_fx[rate_cols].mean(axis=1)

# Keep only the key and the annual average; the averaged source columns are dropped
df_fx = df_fx[["year", "fx_czk_eur_annual_avg"]].copy()

# save to parquet
# df_fx.to_parquet(output_file, engine="pyarrow", compression="snappy", index=False)
# print("Annual FX rates data saved to:", output_file)


### Import price indices w/o energy
**Data Source:** CZSO
- **Data Type:** Import price indices excluding energy
- see src_01_data_curation/data_curation_import_prices.ipynb for details 
Data: 
```csv
Year,Index
2008,91.87
2009,92.44
2010,92.05
2011,92.34
2012,94.5
2013,95.46
2014,98.56
2015,100.0
2016,97.99
2017,97.32
2018,95.26
2019,96.2
2020,97.48
2021,99.0
2022,105.88
2023,104.64
2024,107.73
```


In [24]:
# Source: data/source_cleaned/import_price_index_ex_energy.csv
# Locate the already-curated import price index file
script_dir = os.getcwd()  # Current directory in Jupyter
project_root = os.path.abspath(os.path.join(script_dir, ".."))
input_file = os.path.join(project_root, "data", "source_cleaned", "import_price_index_ex_energy.csv")

# Load it and bring it into the tidy (year, value, metric) layout in one chain:
#   - "Year"  -> "year"  (cast to int; left as-is if the cast fails)
#   - "Index" -> "value" (cast to float; left as-is if the cast fails)
#   - constant "metric" label identifying the series
df_import_price_index = (
    pd.read_csv(input_file)
    .rename(columns={"Year": "year", "Index": "value"})
    .assign(
        year=lambda frame: frame["year"].astype(int, errors="ignore"),
        value=lambda frame: frame["value"].astype(float, errors="ignore"),
        metric="import_price_index_ex_energy",
    )
)





In [25]:
# Display the prepared import price index table (columns: year, value, metric).
df_import_price_index

Unnamed: 0,year,value,metric
0,2008,91.87,import_price_index_ex_energy
1,2009,92.44,import_price_index_ex_energy
2,2010,92.05,import_price_index_ex_energy
3,2011,92.34,import_price_index_ex_energy
4,2012,94.5,import_price_index_ex_energy
5,2013,95.46,import_price_index_ex_energy
6,2014,98.56,import_price_index_ex_energy
7,2015,100.0,import_price_index_ex_energy
8,2016,97.99,import_price_index_ex_energy
9,2017,97.32,import_price_index_ex_energy


### merge data



In [26]:
# Start from the repo-rate years and left-join every other annual series on "year"
# (order: HICP, wages, GDP, unemployment, FX).
df_final = final_annual
for annual_frame in (df_hicp_annual, df_wages, df_gdp, df_unemp, df_fx):
    df_final = df_final.merge(annual_frame, on="year", how="left")

# Reshape wide -> tidy: one row per (year, metric) with its value
df_final = df_final.melt(id_vars=["year"], var_name="metric", value_name="value")

# The import price index is already tidy; align its column order and append it
df_import_price_index = df_import_price_index[["year", "metric", "value"]]
df_final = pd.concat([df_final, df_import_price_index], ignore_index=True)

# Write the combined tidy table next to the other cleaned sources
output_file = os.path.join(output_folder, "economy_annual_tidy.parquet")
df_final.to_parquet(output_file, engine="pyarrow", compression="snappy", index=False)
print("Annual economy data saved to:", output_file)

Annual economy data saved to: /Users/adam/Library/Mobile Documents/com~apple~CloudDocs/School/Master's Thesis/Analysis/profit-margins-inflation/data/source_cleaned/economy_annual_tidy.parquet
