In [1]:
# ------------------------------------------------------------
# 0. Paths
# ------------------------------------------------------------
from pathlib import Path
import pandas as pd
import numpy as np
import os

# All locations hang off the parent of the current working directory,
# i.e. the notebook is expected to be run from a sub-folder of the project.
project_root = Path.cwd().parent
clean_dir = project_root.joinpath("data", "source_cleaned")
in_file = clean_dir.joinpath("magnusweb_tidy.parquet")            # tidy long input
out_file = clean_dir.joinpath("magnusweb_tidy_enriched.parquet")  # enriched output target


In [2]:
# ------------------------------------------------------------
# 1. Read the tidy (long-format) table from parquet
# ------------------------------------------------------------
# `df` is the base frame every later cell derives from.
df = pd.read_parquet(path=in_file, engine="pyarrow")


In [3]:
# ------------------------------------------------------------
# 2. Narrow the long table to the inputs of the margin maths
#    (two profit metrics plus sales revenue, key/value columns only)
# ------------------------------------------------------------
margin_inputs = ["oper_profit", "profit_net", "sales_revenue"]
key_value_cols = ["ico", "year", "quarter", "metric", "value"]

want = df["metric"].isin(margin_inputs)
mini = df.loc[want, key_value_cols].copy()   # copy so `mini` is independent of `df`

In [4]:
# ------------------------------------------------------------
# 3. Pivot to one row per (ico, year, quarter), one column per metric
#    observed=True keeps only key combinations that actually occur
#    (relevant when the grouping keys are categorical), which avoids
#    materialising the full cartesian product of the keys.
# ------------------------------------------------------------
pivot_keys = ["ico", "year", "quarter"]

wide = mini.pivot_table(
    values="value",
    index=pivot_keys,
    columns="metric",
    aggfunc="first",   # if a key/metric pair repeats, a single value is kept
    observed=True,     # this is the crucial switch
)
wide = wide.reset_index()   # turn the pivot keys back into ordinary columns


In [5]:
# ------------------------------------------------------------
# 4. Compute margins
#    margin = profit / sales_revenue, defined only for strictly
#    positive sales; every other row (zero, negative or missing
#    sales) gets NaN.
# ------------------------------------------------------------
has_sales = wide["sales_revenue"] > 0

for margin_col, profit_col in (
    ("operating_margin", "oper_profit"),
    ("net_margin", "profit_net"),
):
    ratio = wide[profit_col] / wide["sales_revenue"]
    wide[margin_col] = ratio.where(has_sales)   # NaN wherever has_sales is False

In [6]:
# Inspect the pivoted frame together with the two derived margin columns
# (bare last expression -> rich notebook display).
wide

metric,ico,year,quarter,oper_profit,profit_net,sales_revenue,operating_margin,net_margin
0,101435,2001,4,,11270.0,28930280.0,,0.000390
1,101435,2002,4,,20470.0,32669360.0,,0.000627
2,101435,2003,4,,185630.0,46378490.0,,0.004003
3,101435,2004,4,,297360.0,55807400.0,,0.005328
4,101435,2005,4,,266500.0,78691860.0,,0.003387
...,...,...,...,...,...,...,...,...
84827,9990739,2022,4,48528000.0,29906000.0,988637000.0,0.049086,0.030250
84828,9990739,2023,4,43603000.0,28765000.0,889042000.0,0.049045,0.032355
84829,9997016,2021,4,-6113000.0,-8355000.0,46905000.0,-0.130327,-0.178126
84830,9997016,2022,4,27676000.0,16808000.0,674168000.0,0.041052,0.024931


In [7]:
# ------------------------------------------------------------
# 5. Reshape the two margin columns back into long format
# ------------------------------------------------------------
key_cols = ["ico", "year", "quarter"]
margin_cols = ["operating_margin", "net_margin"]

margins_long = pd.melt(
    wide[key_cols + margin_cols],
    id_vars=key_cols,
    var_name="metric",     # former column name becomes the metric label
    value_name="value",
)
# Rows whose margin is NaN are dropped from the long table.
margins_long = margins_long.dropna(subset=["value"])


In [None]:
# ------------------------------------------------------------
# 6. Re-attach static firm descriptors
#    (grab one row per ico from the original DF and left-join)
# ------------------------------------------------------------
static_cols = [
    "ico", "name", "main_nace", "main_nace_code", "sub_nace_cz",
    "sub_nace_cz_code", "main_okec", "main_okec_code",
    "sub_okec", "sub_okec_code", "esa2010", "esa95",
    "locality", "region", "num_employees", "turnover_cat",
    "audit", "consolidation", "currency", "date_founded", "date_dissolved"
]

# One descriptor row per firm: the first occurrence of each ico in df.
static = (
    df.loc[:, static_cols]
      .drop_duplicates(subset=["ico"])
      .set_index("ico")
)

# Join onto the core long columns only, so re-running this cell does not
# fail on descriptor columns that a previous run already attached.
core_cols = ["ico", "year", "quarter", "metric", "value"]
margins_long = margins_long[core_cols].join(static, on="ico")

# Give overlapping columns exactly the categorical dtype used in df.
# A plain astype("category") would infer categories from the values present
# here; categoricals with differing categories degrade to object on concat.
# Every value in these columns originates from df, so the cast cannot
# introduce missing values.
for col in static_cols:
    if isinstance(df[col].dtype, pd.CategoricalDtype):
        margins_long[col] = margins_long[col].astype(df[col].dtype)

In [None]:
# ------------------------------------------------------------
# 7. Stack the derived margin rows underneath the original long data
# ------------------------------------------------------------
long_frames = [df, margins_long]   # original rows first, margin rows after
df_out = pd.concat(long_frames, ignore_index=True)   # fresh 0..n-1 index

In [10]:
# ------------------------------------------------------------
# 8. Append CZSO sector code (czso_code) via NACE mapping
# ------------------------------------------------------------
# 8.1 Load the mapping table (contains magnus_nace ↔ czso_code)
nace_map_path = clean_dir / "t_nace_matching.parquet"   # adjust if located elsewhere
nace_map = (
    pd.read_parquet(nace_map_path, engine="pyarrow")
      .loc[:, ["magnus_nace", "czso_code", "level", "name_czso_en"]]              # keep only what we need
      .drop_duplicates()
)

# remove rows with empty magnus_nace or containing empty string (= levels above 4)
has_magnus_code = nace_map["magnus_nace"].notna() & (nace_map["magnus_nace"] != "")
nace_map = nace_map[has_magnus_code]

# keep only the row with the lowest level for each combination of magnus_nace and name_czso_en
lowest_level_rows = nace_map.groupby(["magnus_nace", "name_czso_en"])["level"].idxmin()
nace_map = nace_map.loc[lowest_level_rows]

# 8.2 Ensure key columns are comparable strings.
# A bare astype(str) would turn missing codes into the literal string "nan"
# and write that into the output; mask them back so missing stays missing.
raw_codes = df_out["main_nace_code"]
df_out["main_nace_code"] = raw_codes.astype(str).where(raw_codes.notna())
nace_map = nace_map.assign(magnus_nace=nace_map["magnus_nace"].astype(str))

# 8.3 Merge: main_nace_code (Magnus-style) → czso_code
df_out = (
    df_out
      .merge(
          nace_map.rename(columns={"magnus_nace": "main_nace_code"}),
          on="main_nace_code",
          how="left",
          validate="m:1"        # raises if a code appears more than once in the mapping
      )
)

# 8.4 Quick sanity check: share of rows that received a czso_code
matched   = df_out["czso_code"].notna().mean()
print(f"✅  CZSO code attached to {matched:.1%} of rows")


✅  CZSO code attached to 100.0% of rows


In [None]:
# NOTE(review): the save step below is commented out, so running the notebook
# end-to-end does not write `out_file`. Uncomment to persist the enriched data.
# df_out.to_parquet(out_file, engine="pyarrow", compression="snappy")

# print("✅  Saved:", out_file)
# print("    Added metrics:", margins_long['metric'].unique().tolist())
# print("    Total rows:", len(df_out))