In [1]:
import os
import pandas as pd

### Time-Weighted Annual CNB Repo Rates

**Data Source**  
A text file listing CNB repo rate decisions. Each decision is valid until the next one begins.

**Method**  
- **Time Weighting**: Instead of taking a simple average of all rates in a given year, we split each `[start_date, next_start)` interval at calendar-year boundaries and compute a time-weighted average. This ensures that a rate valid for a longer period has a proportionally bigger impact on the annual average.

**Justification**  
- **Accuracy**: Merely averaging rate values ignores how long each rate was in effect. Time weighting aligns the annual rate with real monetary policy conditions experienced throughout the year.
- **Usability**: The resulting annual series can be easily merged with other macro data (HICP, wages, etc.) for subsequent econometric analysis.


In [23]:
import os
import pandas as pd
from datetime import timedelta

# Locations, resolved relative to the parent of the notebook's working directory
script_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(script_dir, ".."))
output_folder = os.path.join(project_root, "data", "source_cleaned")
input_file = os.path.join(project_root, "data", "source_raw", "economy", "CNB_repo_sazby.txt")
output_file = os.path.join(output_folder, "cnb_repo_annual.parquet")
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Load the pipe-separated decision list: one row per decision (date | rate).
# Dates are read as strings so the YYYYMMDD format can be parsed explicitly.
df_repo = pd.read_csv(
    input_file,
    sep="|",
    header=None,
    names=["VALID_FROM", "CNB_REPO_RATE_IN_PCT"],
    dtype={"VALID_FROM": str},
)

# Anything that does not parse is coerced to NaT / NaN; rows without a usable
# date are discarded, the remainder is ordered chronologically.
df_repo = df_repo.assign(
    VALID_FROM=pd.to_datetime(df_repo["VALID_FROM"], format="%Y%m%d", errors="coerce"),
    CNB_REPO_RATE_IN_PCT=pd.to_numeric(df_repo["CNB_REPO_RATE_IN_PCT"], errors="coerce"),
)
df_repo = df_repo.dropna(subset=["VALID_FROM"]).sort_values("VALID_FROM")

# Each rate stays in force until the following decision takes effect, so the
# (exclusive) end of a row's validity is the start date of the next row.
df_repo["NEXT_START"] = df_repo["VALID_FROM"].shift(-1)

# The most recent decision has no successor: close its interval at 31 December
# of the year after it took effect. NOTE: NEXT_START is treated as exclusive
# downstream, so the last rate is carried through 30 December of that year.
if len(df_repo) > 0:
    last_idx = df_repo.index[-1]
    last_year = df_repo.loc[last_idx, "VALID_FROM"].year
    cutoff_date = pd.to_datetime(f"{last_year + 1}-12-31")
    df_repo.loc[last_idx, "NEXT_START"] = cutoff_date

def split_interval_by_year(start_date, end_date):
    """Cut the half-open interval [start_date, end_date) at calendar-year ends.

    Parameters
    ----------
    start_date : pandas.Timestamp
        First day on which the interval applies (inclusive).
    end_date : pandas.Timestamp
        First day on which the interval no longer applies (exclusive).

    Returns
    -------
    list of dict
        One entry per calendar year touched, in chronological order, with keys
        ``year``, ``days_in_interval`` (days of the interval falling in that
        year) and ``total_days_for_rate`` (length of the whole interval in
        days). An empty list is returned when the interval contains no day.
    """
    # Work with an inclusive last day so that day counts are simply "diff + 1".
    last_day = end_date - pd.Timedelta(days=1)
    if last_day < start_date:
        return []

    span_days = (last_day - start_date).days + 1
    pieces = []
    cursor = start_date
    while cursor <= last_day:
        dec_31 = pd.to_datetime(f"{cursor.year}-12-31")
        # A piece ends at the year's end or at the interval's end, whichever comes first.
        piece_end = dec_31 if dec_31 <= last_day else last_day
        pieces.append({
            "year": cursor.year,
            "days_in_interval": (piece_end - cursor).days + 1,
            "total_days_for_rate": span_days,
        })
        cursor = piece_end + pd.Timedelta(days=1)
    return pieces

# Expand every rate decision into one record per calendar year it touches.
# The weight of a record is the number of days the rate was in force within
# that year, so the annual figure is
#
#     annual_rate(year) = sum(rate * days_in_year) / sum(days_in_year)
#
# Weighting by the share of the decision's *own* validity period would be
# wrong: a rate lasting 10 days and a rate lasting 300 days would both get
# weight 1 whenever each lies entirely inside a single year.
records = []
for _, row in df_repo.iterrows():
    rate = row["CNB_REPO_RATE_IN_PCT"]
    if pd.isna(rate):
        # A rate that failed numeric parsing carries no information; keep its
        # days out of both the numerator and the denominator.
        continue
    for iv in split_interval_by_year(row["VALID_FROM"], row["NEXT_START"]):
        days = iv["days_in_interval"]
        records.append({
            "year": iv["year"],
            "repo_rate": rate,
            "days_in_year": days,
            "weighted_rate": rate * days,
        })

df_intervals = pd.DataFrame(records)
if not df_intervals.empty:
    # Per year: day-weighted sum of rates, number of covered days, and the
    # number of rate pieces that contributed (kept for diagnostics only).
    annual_data = df_intervals.groupby("year", as_index=False).agg(
        sum_weighted=("weighted_rate", "sum"),
        sum_days=("days_in_year", "sum"),
        sum_rates=("repo_rate", "count"),
    )

    # Dividing by the covered days (rather than 365/366) keeps the average
    # correct for years that are only partially covered by the data.
    annual_merged = annual_data.copy()
    annual_merged["cnb_repo_rate_annual"] = (
        annual_merged["sum_weighted"] / annual_merged["sum_days"]
    )

    final_annual = annual_merged[["year", "cnb_repo_rate_annual"]].sort_values("year")
    # drop years pre 2000
    final_annual = final_annual[final_annual["year"] >= 2000]
    # Save to parquet
    final_annual.to_parquet(output_file, engine="pyarrow", compression="snappy", index=False)
    print(f"Time-weighted annual CNB repo rates saved to: {output_file}")
else:
    print("No records found after year 2000.")


Time-weighted annual CNB repo rates saved to: /Users/adam/Library/Mobile Documents/com~apple~CloudDocs/School/Master's Thesis/Analysis/profit-margins-inflation/data/source_cleaned/cnb_repo_annual.parquet


### Annual HICP Data Preparation

**Data Source:**  
Monthly HICP data (Overall index) with columns:
- `DATE` (e.g., "1996-12-31")
- `TIME PERIOD`
- `"HICP - Overall index (ICP.M.CZ.N.000000.4.ANR)"`

**Method:**  
- **Parse Dates & Numeric Conversion:**  
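  Convert the `DATE` column to datetime and the HICP column to numeric. Note that the series key ends in `ANR`, which in the ECB ICP dataset denotes the annual rate of change, so the values are year-on-year percentage changes rather than index levels.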
- **Filter December Values:**  
  Select only records where the month is December. December data represents the end-of-year value, which is often used as the annual measure.
- **Extract Year:**  
  Create a `year` column from the December dates.
- **Output:**  
  The resulting DataFrame contains one observation per year with columns `year` and `hicp_dec`. This annual series will serve as the base for merging with other macroeconomic data.

**Justification:**  
Using December values provides a consistent, end-of-year snapshot. This harmonized annual format simplifies integration with datasets such as the CNB repo rates and firm-level financial data.


In [34]:
# Locations, resolved relative to the parent of the notebook's working directory
script_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(script_dir, ".."))
output_folder = os.path.join(project_root, "data", "source_cleaned")
input_file = os.path.join(project_root, "data", "source_raw", "economy", "ECB Data Portal_20250402011223_HICP_from1996_CZ.csv")
output_file = os.path.join(output_folder, "hicp_december_annual.parquet")
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Load the monthly HICP export
df_hicp = pd.read_csv(input_file)

# Parse the observation date; unparseable entries become NaT
df_hicp["DATE"] = pd.to_datetime(df_hicp["DATE"], format="%Y-%m-%d", errors="coerce")

# Numeric copy of the HICP series under a short column name; bad values become NaN
df_hicp["hicp_dec"] = pd.to_numeric(
    df_hicp["HICP - Overall index (ICP.M.CZ.N.000000.4.ANR)"],
    errors="coerce",
)

# Keep the December observation of each year and tag it with its calendar year
is_december = df_hicp["DATE"].dt.month == 12
df_hicp_dec = df_hicp.loc[is_december].copy()
df_hicp_dec["year"] = df_hicp_dec["DATE"].dt.year

# Annual table: just the year and its December value
df_hicp_annual = df_hicp_dec.loc[:, ["year", "hicp_dec"]].reset_index(drop=True)

# remove pre 2000 data
from_2000 = df_hicp_annual["year"] >= 2000
df_hicp_annual = df_hicp_annual[from_2000]

# Persist the annual series as Parquet
df_hicp_annual.to_parquet(output_file, engine="pyarrow", compression="snappy", index=False)

print("Annual HICP (December values) saved to:", output_file)


Annual HICP (December values) saved to: /Users/adam/Library/Mobile Documents/com~apple~CloudDocs/School/Master's Thesis/Analysis/profit-margins-inflation/data/source_cleaned/hicp_december_annual.parquet
