In [75]:
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import os

# Load the data using lazy evaluation for better performance
main_path = os.path.join("..", "data", "source_cleaned", "magnusweb_panel_imputed.parquet")
#main_path = os.path.join("..", "data", "source_cleaned", "magnusweb_panel_hq.parquet")

# Output path for the final merged dataset
output_path = os.path.join("..", "data", "data_ready", "merged_panel_imputed.parquet")
#output_path = os.path.join("..", "data", "data_ready", "merged_panel_hq.parquet")

# Using scan_parquet for lazy loading
main_df = pl.scan_parquet(main_path)
#hq_df = pl.scan_parquet(hq_path)

nace_propagated_path = os.path.join("..", "data", "source_cleaned", "data_by_nace_annual_tidy_propagated.parquet")
nace_propagated_df = pl.scan_parquet(nace_propagated_path)

macro_indicators_path = os.path.join("..", "data", "source_cleaned", "economy_annual_tidy.parquet")
macro_indicators_df = pl.scan_parquet(macro_indicators_path)

print("Data loaded using lazy evaluation")    


Data loaded using lazy evaluation


In [76]:
# Explore the structure of each dataset
print("=== Main DataFrame Structure ===")
main_sample = main_df.limit(0).collect()  # Get structure without data
print(f"Main columns: {main_sample.columns}")

print("\n=== NACE Propagated DataFrame Structure ===")
nace_sample = nace_propagated_df.limit(5).collect()
print(f"NACE columns: {nace_sample.columns}")
print(f"NACE sample shape: {nace_sample.shape}")
print(nace_sample)

print("\n=== Macro Indicators DataFrame Structure ===")
macro_sample = macro_indicators_df.limit(5).collect()
print(f"Macro columns: {macro_sample.columns}")
print(f"Macro sample shape: {macro_sample.shape}")
print(macro_sample)

=== Main DataFrame Structure ===
Main columns: ['ico', 'year', 'other_liabilities', 'costs', 'sales_revenue', 'equity', 'profit_net', 'turnover', 'current_assets', 'oper_profit', 'total_liabilities', 'total_assets', 'total_liabilities_and_equity', 'profit_pre_tax', 'other_assets', 'fixed_assets', 'name', 'main_nace', 'main_nace_code', 'sub_nace_cz', 'sub_nace_cz_code', 'main_okec', 'main_okec_code', 'sub_okec', 'sub_okec_code', 'esa2010', 'esa95', 'locality', 'region', 'num_employees', 'num_employees_cat', 'turnover_cat', 'audit', 'consolidation', 'currency', 'date_founded', 'date_dissolved', 'status', 'legal_form', 'entity_type', 'year_founded', 'year_dissolved', 'is_dissolved', 'operating_margin_cal', 'net_margin_cal', 'roa_ebit_cal', 'roe_cal', 'equity_ratio_cal', 'cost_ratio_cal', 'asset_turnover_cal', 'labor_productivity_cal', 'effective_tax_rate_cal', 'level1_code', 'level2_code', 'name_czso_en', 'industry_flag']

=== NACE Propagated DataFrame Structure ===
NACE columns: ['czso_c

In [77]:
# Transform NACE data from long to wide format and add level-specific prefixes
print("=== Transforming NACE data ===")

# First, let's see what metrics we have in the NACE data
nace_metrics = nace_propagated_df.select("metric").unique().collect()
print(f"Available NACE metrics: {nace_metrics['metric'].to_list()}")

# Transform NACE Level 1 data from long to wide format
print("\n--- Processing Level 1 NACE data ---")
nace_level1 = nace_propagated_df.filter(pl.col("level") == 1).collect().pivot(
    index=["czso_code", "level", "year"],  # FIX: removed magnus_nace from index
    columns="metric", 
    values="value"
)

# Add sector_level1_ prefix to metric columns (all columns except the index ones)
index_cols = ["czso_code", "level", "year"]
nace_level1_renamed = nace_level1.rename({
    col: f"sector_level1_{col}" for col in nace_level1.columns 
    if col not in index_cols
})

# Add the English name column for Level 1 NACE
nace_level1_names = nace_propagated_df.filter(pl.col("level") == 1).select(
    ["czso_code", "name_en", "year"]
).unique().collect()

# Rename to proper suffix
nace_level1_names = nace_level1_names.rename({"name_en": "level1_nace_en_name"})

# Join the names with the level 1 data
nace_level1_renamed = pl.LazyFrame(nace_level1_renamed).join(
    pl.LazyFrame(nace_level1_names),
    left_on=["czso_code", "year"],
    right_on=["czso_code", "year"],
    how="left"
).unique(subset=["czso_code", "year"], keep="first")  # FIX: ensure uniqueness

print(f"Level 1 NACE data transformed: {nace_level1_renamed.collect_schema().names()}")

# Transform NACE Level 2 data from long to wide format
print("\n--- Processing Level 2 NACE data ---")
nace_level2 = nace_propagated_df.filter(pl.col("level") == 2).collect().pivot(
    index=["czso_code", "level", "year"],  # FIX: removed magnus_nace from index
    columns="metric", 
    values="value"
)

# Add sector_level2_ prefix to metric columns (all columns except the index ones)
nace_level2_renamed = nace_level2.rename({
    col: f"sector_level2_{col}" for col in nace_level2.columns 
    if col not in index_cols
})

# Add the English name column for Level 2 NACE
nace_level2_names = nace_propagated_df.filter(pl.col("level") == 2).select(
    ["czso_code", "name_en", "year"]
).unique().collect()

# Rename to proper suffix
nace_level2_names = nace_level2_names.rename({"name_en": "level2_nace_en_name"})

# Join the names with the level 2 data
nace_level2_renamed = pl.LazyFrame(nace_level2_renamed).join(
    pl.LazyFrame(nace_level2_names),
    left_on=["czso_code", "year"],
    right_on=["czso_code", "year"],
    how="left"
).unique(subset=["czso_code", "year"], keep="first")  # FIX: ensure uniqueness

print(f"Level 2 NACE data transformed: {nace_level2_renamed.collect_schema().names()}")

print("\nNACE data transformed to wide format with level-specific prefixes")

=== Transforming NACE data ===
Available NACE metrics: ['ppi_by_nace', 'avg_wages_by_nace', 'no_of_employees_by_nace']

--- Processing Level 1 NACE data ---
Level 1 NACE data transformed: ['czso_code', 'level', 'year', 'sector_level1_avg_wages_by_nace', 'sector_level1_no_of_employees_by_nace', 'sector_level1_ppi_by_nace', 'level1_nace_en_name']

--- Processing Level 2 NACE data ---
Level 2 NACE data transformed: ['czso_code', 'level', 'year', 'sector_level2_ppi_by_nace', 'sector_level2_avg_wages_by_nace', 'sector_level2_no_of_employees_by_nace', 'level2_nace_en_name']

NACE data transformed to wide format with level-specific prefixes


  nace_level1 = nace_propagated_df.filter(pl.col("level") == 1).collect().pivot(
  nace_level2 = nace_propagated_df.filter(pl.col("level") == 2).collect().pivot(


In [78]:
# Transform macro data from long to wide format and add mac_ prefix
print("=== Transforming Macro data ===")

# First, let's see what metrics we have in the macro data
macro_metrics = macro_indicators_df.select("metric").unique().collect()
print(f"Available macro metrics: {macro_metrics['metric'].to_list()}")

# Transform macro data from long to wide format
# For lazy frames, we need to collect first
macro_wide = macro_indicators_df.collect().pivot(
    index=["year"],
    columns="metric",
    values="value"
)

# Add mac_ prefix to metric columns (all columns except year)
macro_renamed = macro_wide.rename({
    col: f"mac_{col}" for col in macro_wide.columns 
    if col != "year"
})

# Convert back to lazy frame for efficient joining
macro_renamed = pl.LazyFrame(macro_renamed)

print("Macro data transformed to wide format with mac_ prefix")
print(f"Transformed columns: {macro_renamed.collect_schema().names()}")

=== Transforming Macro data ===
Available macro metrics: ['gdp_2020_base_prices_sopr', 'gdp_2020_base_prices', 'GAP', 'deflator_base_2020', 'UNR', 'nom_gr_avg_wage_czk', 'ITV_ANNPCT', 'ULCDR', 'EXCHEB', 'hicp_pure_energy_roc', 'RPMGS', 'gdp_nominal_prices', 'FBGSQ', 'fx_czk_eur_annual_avg', 'KTPV_ANNPCT', 'IRL', 'CPI_YTYPCT', 'unemp_rate', 'IRS', 'EXCH', 'HRS', 'GGFLMQ', 'hicp_energy_full_roc', 'cnb_repo_rate_annual', 'NLGXQ', 'NOOQ', 'deflator_nominal', 'MPEN', 'CPV_ANNPCT', 'import_price_index_ex_energy', 'PCORE_YTYPCT', 'TTRADE', 'hicp_overall_roc', 'PDTY', 'hicp_dec', 'no_of_employees_ths', 'ULC']
Macro data transformed to wide format with mac_ prefix
Transformed columns: ['year', 'mac_cnb_repo_rate_annual', 'mac_hicp_dec', 'mac_hicp_overall_roc', 'mac_hicp_pure_energy_roc', 'mac_hicp_energy_full_roc', 'mac_nom_gr_avg_wage_czk', 'mac_no_of_employees_ths', 'mac_gdp_nominal_prices', 'mac_gdp_2020_base_prices', 'mac_gdp_2020_base_prices_sopr', 'mac_deflator_nominal', 'mac_deflator_bas

  macro_wide = macro_indicators_df.collect().pivot(


In [79]:
# Add firm_ prefix to all columns in main_df (except year and join keys)
print("=== Adding firm_ prefix to firm-level columns ===")

# Get the columns from main_df that need the firm_ prefix
main_cols_sample = main_df.limit(0).collect().columns
print(f"Original main columns count: {len(main_cols_sample)}")

# Create rename mapping for all columns except 'year' and join keys
firm_rename_map = {}
for col in main_cols_sample:
    if col not in ['year', 'level1_code', 'level2_code']:  # Keep join keys unchanged
        firm_rename_map[col] = f"firm_{col}"

print(f"Renaming {len(firm_rename_map)} columns with firm_ prefix")

# Apply the renaming to main_df
main_df_renamed = main_df.rename(firm_rename_map)

print("Firm columns renamed successfully!")

# Step 1: Merge main_df with Level 1 NACE data
print("=== First Merge: Main + Level 1 NACE data ===")

# Check if we have the right join keys
main_cols_renamed = main_df_renamed.limit(0).collect().columns
print(f"Main df has level1_code: {'level1_code' in main_cols_renamed}")
print(f"Main df has level2_code: {'level2_code' in main_cols_renamed}")

# Perform the first merge with Level 1 NACE data (left join to keep all main data)
merged_step1 = main_df_renamed.join(
    nace_level1_renamed,
    left_on=["level1_code", "year"],
    right_on=["czso_code", "year"], 
    how="left"
)

# Verify the first merge
step1_sample = merged_step1.limit(3).collect()
print(f"After Level 1 NACE merge - Shape: {step1_sample.shape}")
print(f"Level 1 sector columns: {[col for col in step1_sample.columns if col.startswith('sector_level1_')][:5]}")

# Step 2: Merge with Level 2 NACE data
print("=== Second Merge: Adding Level 2 NACE data ===")

# Perform the second merge with Level 2 NACE data (left join to keep all existing data)
merged_step2 = merged_step1.join(
    nace_level2_renamed,
    left_on=["level2_code", "year"],
    right_on=["czso_code", "year"],
    how="left"
)

# Verify the second merge
step2_sample = merged_step2.limit(3).collect()
print(f"After Level 2 NACE merge - Shape: {step2_sample.shape}")
print(f"Level 2 sector columns: {[col for col in step2_sample.columns if col.startswith('sector_level2_')][:5]}")

# Check for any missing joins
null_check = merged_step2.select([
    pl.col("level1_code").is_null().sum().alias("null_level1_code"),
    pl.col("level2_code").is_null().sum().alias("null_level2_code"),
    pl.col("year").is_null().sum().alias("null_year")
]).collect()
print(f"Null check after NACE merges: {null_check}")

print("NACE merges completed successfully!")

=== Adding firm_ prefix to firm-level columns ===
Original main columns count: 56
Renaming 53 columns with firm_ prefix
Firm columns renamed successfully!
=== First Merge: Main + Level 1 NACE data ===
Main df has level1_code: True
Main df has level2_code: True


After Level 1 NACE merge - Shape: (3, 61)
Level 1 sector columns: ['sector_level1_avg_wages_by_nace', 'sector_level1_no_of_employees_by_nace', 'sector_level1_ppi_by_nace']
=== Second Merge: Adding Level 2 NACE data ===
After Level 2 NACE merge - Shape: (3, 66)
Level 2 sector columns: ['sector_level2_ppi_by_nace', 'sector_level2_avg_wages_by_nace', 'sector_level2_no_of_employees_by_nace']
Null check after NACE merges: shape: (1, 3)
┌──────────────────┬──────────────────┬───────────┐
│ null_level1_code ┆ null_level2_code ┆ null_year │
│ ---              ┆ ---              ┆ ---       │
│ u32              ┆ u32              ┆ u32       │
╞══════════════════╪══════════════════╪═══════════╡
│ 1268             ┆ 1268             ┆ 0         │
└──────────────────┴──────────────────┴───────────┘
NACE merges completed successfully!


In [80]:
# Step 3: Merge with macro data on year
print("=== Third Merge: Adding Macro data ===")

# Perform the third merge (left join to keep all existing data)
merged_final = merged_step2.join(
    macro_renamed,
    on="year",
    how="left"
)

# Collect the final result for verification
final_df = merged_final.collect()
print(f"Final merged data - Shape: {final_df.shape}")
print(f"Total columns: {len(final_df.columns)}")

# Categorize columns by prefix
firm_cols = [col for col in final_df.columns if not col.startswith(('sector_', 'mac_'))]
sector_level1_cols = [col for col in final_df.columns if col.startswith('sector_level1_')]
sector_level2_cols = [col for col in final_df.columns if col.startswith('sector_level2_')]
mac_cols = [col for col in final_df.columns if col.startswith('mac_')]

print(f"\nColumn breakdown:")
print(f"  - Firm columns: {len(firm_cols)}")
print(f"  - Sector Level 1 columns: {len(sector_level1_cols)}")
print(f"  - Sector Level 2 columns: {len(sector_level2_cols)}")
print(f"  - Macro columns: {len(mac_cols)}")

print(f"\nSample sector level 1 columns: {sector_level1_cols[:5]}")
print(f"Sample sector level 2 columns: {sector_level2_cols[:5]}")
print(f"Sample macro columns: {mac_cols[:5]}")

# Final verification - check for missing data in key areas
print(f"\n=== Final Verification ===")
verification = final_df.select([
    pl.len().alias("total_rows"),
    pl.col("year").is_null().sum().alias("null_years"),
    pl.col("firm_ico").is_null().sum().alias("null_firm_ico") if "firm_ico" in final_df.columns else pl.lit(0).alias("null_firm_ico"),
    pl.col(sector_level1_cols[0]).is_null().sum().alias(f"null_{sector_level1_cols[0][:20]}") if sector_level1_cols else pl.lit(0).alias("null_sector_level1_sample"),
    pl.col(sector_level2_cols[0]).is_null().sum().alias(f"null_{sector_level2_cols[0][:20]}") if sector_level2_cols else pl.lit(0).alias("null_sector_level2_sample"),
    pl.col(mac_cols[0]).is_null().sum().alias(f"null_{mac_cols[0]}") if mac_cols else pl.lit(0).alias("null_mac_sample")
])

print(verification)

# Check for duplicate firm-year observations
if "firm_ico" in final_df.columns:
    duplicate_rows = final_df.group_by(["firm_ico", "year"]).len().filter(pl.col("len") > 1)
    num_duplicates = duplicate_rows.height
    print(f"\nDuplicate firm-year check: Found {num_duplicates} duplicate firm-year observations.")
    if num_duplicates > 0:
        print("Sample of duplicate firm-year pairs:")
        print(duplicate_rows.head())

print(f"\nMerge process completed successfully!")
print(f"Final dataset contains {final_df.shape[0]:,} rows and {final_df.shape[1]} columns")

=== Third Merge: Adding Macro data ===
Final merged data - Shape: (661796, 103)
Total columns: 103

Column breakdown:
  - Firm columns: 60
  - Sector Level 1 columns: 3
  - Sector Level 2 columns: 3
  - Macro columns: 37

Sample sector level 1 columns: ['sector_level1_avg_wages_by_nace', 'sector_level1_no_of_employees_by_nace', 'sector_level1_ppi_by_nace']
Sample sector level 2 columns: ['sector_level2_ppi_by_nace', 'sector_level2_avg_wages_by_nace', 'sector_level2_no_of_employees_by_nace']
Sample macro columns: ['mac_cnb_repo_rate_annual', 'mac_hicp_dec', 'mac_hicp_overall_roc', 'mac_hicp_pure_energy_roc', 'mac_hicp_energy_full_roc']

=== Final Verification ===
shape: (1, 6)
┌────────────┬────────────┬───────────────┬──────────────────┬──────────────────┬──────────────────┐
│ total_rows ┆ null_years ┆ null_firm_ico ┆ null_sector_leve ┆ null_sector_leve ┆ null_mac_cnb_rep │
│ ---        ┆ ---        ┆ ---           ┆ l1_avg_wa        ┆ l2_ppi_by        ┆ o_rate_annual    │
│ u32       

In [81]:
# Examine sample rows to verify data alignment
print("=== Sample Data Verification ===")

# Show a few sample rows with key columns including the new English name columns
key_columns = ["firm_ico", "year", "level1_code", "level2_code"] if "firm_ico" in final_df.columns else ["year", "level1_code", "level2_code"]
sample_cols = (key_columns + 
               ["level1_nace_en_name", "level2_nace_en_name"] +
               sector_level1_cols[:2] + 
               sector_level2_cols[:2] + 
               mac_cols[:2])
available_sample_cols = [col for col in sample_cols if col in final_df.columns]

print(f"Examining columns: {available_sample_cols}")
sample_data = final_df.select(available_sample_cols).head(5)
print(sample_data)

# Check data coverage by year
year_coverage = final_df.select([
    pl.col("year"),
    pl.len().alias("count")
]).group_by("year").sum().sort("year")

print(f"\nData coverage by year:")
print(year_coverage)

# Check sample of the new name columns
print(f"\n=== NACE English Name Verification ===")
if "level1_nace_en_name" in final_df.columns and "level2_nace_en_name" in final_df.columns:
    name_sample = final_df.select([
        "level1_code", "level1_nace_en_name", 
        "level2_code", "level2_nace_en_name"
    ]).unique().limit(10)
    print("Sample NACE codes and names:")
    print(name_sample)

# Save the final merged dataset
final_df.write_parquet(output_path)
print(f"\nFinal merged dataset saved to: {output_path}")

print(f"\n🎉 Merge process completed successfully!")
print(f"   📊 {final_df.shape[0]:,} rows × {final_df.shape[1]} columns")
print(f"   🏢 {len(firm_cols)} firm-level variables")  
print(f"   🏭 {len(sector_level1_cols)} sector-level 1 variables")
print(f"   🏭 {len(sector_level2_cols)} sector-level 2 variables")
print(f"   📈 {len(mac_cols)} macro-level variables")
print(f"   📝 2 NACE English name columns (level1_nace_en_name, level2_nace_en_name)")

=== Sample Data Verification ===
Examining columns: ['firm_ico', 'year', 'level1_code', 'level2_code', 'level1_nace_en_name', 'level2_nace_en_name', 'sector_level1_avg_wages_by_nace', 'sector_level1_no_of_employees_by_nace', 'sector_level2_ppi_by_nace', 'sector_level2_avg_wages_by_nace', 'mac_cnb_repo_rate_annual', 'mac_hicp_dec']
shape: (5, 12)
┌──────────┬──────┬────────────┬────────────┬───┬────────────┬────────────┬────────────┬───────────┐
│ firm_ico ┆ year ┆ level1_cod ┆ level2_cod ┆ … ┆ sector_lev ┆ sector_lev ┆ mac_cnb_re ┆ mac_hicp_ │
│ ---      ┆ ---  ┆ e          ┆ e          ┆   ┆ el2_ppi_by ┆ el2_avg_wa ┆ po_rate_an ┆ dec       │
│ str      ┆ i16  ┆ ---        ┆ ---        ┆   ┆ _nace      ┆ ges_by_nac ┆ nual       ┆ ---       │
│          ┆      ┆ str        ┆ str        ┆   ┆ ---        ┆ …          ┆ ---        ┆ f64       │
│          ┆      ┆            ┆            ┆   ┆ f64        ┆ ---        ┆ f64        ┆           │
│          ┆      ┆            ┆            ┆ 