In [8]:
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import os

# Load the data using lazy evaluation for better performance
main_path = os.path.join("..", "data", "source_cleaned", "magnusweb_panel_imputed.parquet")
#main_path = os.path.join("..", "data", "source_cleaned", "magnusweb_panel_hq.parquet")

# Output path for the final merged dataset
output_path = os.path.join("..", "data", "data_ready", "merged_panel_imputed.parquet")
#output_path = os.path.join("..", "data", "data_ready", "merged_panel_hq.parquet")

# Using scan_parquet for lazy loading
main_df = pl.scan_parquet(main_path)
#hq_df = pl.scan_parquet(hq_path)

nace_propagated_path = os.path.join("..", "data", "source_cleaned", "data_by_nace_annual_tidy_propagated.parquet")
nace_propagated_df = pl.scan_parquet(nace_propagated_path)

macro_indicators_path = os.path.join("..", "data", "source_cleaned", "economy_annual_tidy.parquet")
macro_indicators_df = pl.scan_parquet(macro_indicators_path)

print("Data loaded using lazy evaluation")    


Data loaded using lazy evaluation


In [9]:
# Explore the structure of each dataset
print("=== Main DataFrame Structure ===")
main_sample = main_df.limit(0).collect()  # Get structure without data
print(f"Main columns: {main_sample.columns}")

print("\n=== NACE Propagated DataFrame Structure ===")
nace_sample = nace_propagated_df.limit(5).collect()
print(f"NACE columns: {nace_sample.columns}")
print(f"NACE sample shape: {nace_sample.shape}")
print(nace_sample)

print("\n=== Macro Indicators DataFrame Structure ===")
macro_sample = macro_indicators_df.limit(5).collect()
print(f"Macro columns: {macro_sample.columns}")
print(f"Macro sample shape: {macro_sample.shape}")
print(macro_sample)

=== Main DataFrame Structure ===
Main columns: ['ico', 'year', 'other_liabilities', 'costs', 'sales_revenue', 'equity', 'profit_net', 'turnover', 'current_assets', 'oper_profit', 'total_liabilities', 'total_assets', 'total_liabilities_and_equity', 'profit_pre_tax', 'other_assets', 'fixed_assets', 'name', 'main_nace', 'main_nace_code', 'sub_nace_cz', 'sub_nace_cz_code', 'main_okec', 'main_okec_code', 'sub_okec', 'sub_okec_code', 'esa2010', 'esa95', 'locality', 'region', 'num_employees', 'num_employees_cat', 'turnover_cat', 'audit', 'consolidation', 'currency', 'date_founded', 'date_dissolved', 'status', 'legal_form', 'entity_type', 'year_founded', 'year_dissolved', 'is_dissolved', 'operating_margin_cal', 'net_margin_cal', 'roa_ebit_cal', 'roe_cal', 'equity_ratio_cal', 'cost_ratio_cal', 'asset_turnover_cal', 'labor_productivity_cal', 'effective_tax_rate_cal', 'level1_code', 'level2_code', 'name_czso_en', 'industry_flag']

=== NACE Propagated DataFrame Structure ===
NACE columns: ['czso_c

In [10]:
# Transform NACE data from long to wide format and add sector_ prefix
print("=== Transforming NACE data ===")

# First, let's see what metrics we have in the NACE data
nace_metrics = nace_propagated_df.select("metric").unique().collect()
print(f"Available NACE metrics: {nace_metrics['metric'].to_list()}")

# Transform NACE data from long to wide format
# We need to pivot on 'metric' to get separate columns for each metric
# For lazy frames, we need to collect first or use a different approach
nace_wide = nace_propagated_df.collect().pivot(
    index=["czso_code", "magnus_nace", "level", "year"],
    columns="metric", 
    values="value"
)

# Add sector_ prefix to metric columns (all columns except the index ones)
index_cols = ["czso_code", "magnus_nace", "level", "year"]
nace_renamed = nace_wide.rename({
    col: f"sector_{col}" for col in nace_wide.columns 
    if col not in index_cols
})

# Convert back to lazy frame for efficient joining
nace_renamed = pl.LazyFrame(nace_renamed)

print("NACE data transformed to wide format with sector_ prefix")
print(f"Transformed columns: {nace_renamed.collect_schema().names()}")

=== Transforming NACE data ===
Available NACE metrics: ['avg_wages_by_nace', 'no_of_employees_by_nace', 'ppi_by_nace']
NACE data transformed to wide format with sector_ prefix
Transformed columns: ['czso_code', 'magnus_nace', 'level', 'year', 'sector_avg_wages_by_nace', 'sector_no_of_employees_by_nace', 'sector_ppi_by_nace']


  nace_wide = nace_propagated_df.collect().pivot(


In [11]:
# Transform macro data from long to wide format and add mac_ prefix
print("=== Transforming Macro data ===")

# First, let's see what metrics we have in the macro data
macro_metrics = macro_indicators_df.select("metric").unique().collect()
print(f"Available macro metrics: {macro_metrics['metric'].to_list()}")

# Transform macro data from long to wide format
# For lazy frames, we need to collect first
macro_wide = macro_indicators_df.collect().pivot(
    index=["year"],
    columns="metric",
    values="value"
)

# Add mac_ prefix to metric columns (all columns except year)
macro_renamed = macro_wide.rename({
    col: f"mac_{col}" for col in macro_wide.columns 
    if col != "year"
})

# Convert back to lazy frame for efficient joining
macro_renamed = pl.LazyFrame(macro_renamed)

print("Macro data transformed to wide format with mac_ prefix")
print(f"Transformed columns: {macro_renamed.collect_schema().names()}")

=== Transforming Macro data ===
Available macro metrics: ['NLGXQ', 'no_of_employees_ths', 'MPEN', 'UNR', 'ULCDR', 'gdp_2020_base_prices', 'import_price_index_ex_energy', 'NOOQ', 'HRS', 'IRL', 'IRS', 'GAP', 'EXCH', 'CPV_ANNPCT', 'unemp_rate', 'ITV_ANNPCT', 'cnb_repo_rate_annual', 'gdp_nominal_prices', 'hicp_pure_energy_roc', 'ULC', 'deflator_nominal', 'gdp_2020_base_prices_sopr', 'hicp_dec', 'hicp_energy_full_roc', 'deflator_base_2020', 'CPI_YTYPCT', 'nom_gr_avg_wage_czk', 'KTPV_ANNPCT', 'RPMGS', 'EXCHEB', 'GGFLMQ', 'hicp_overall_roc', 'TTRADE', 'fx_czk_eur_annual_avg', 'PCORE_YTYPCT', 'FBGSQ', 'PDTY']
Macro data transformed to wide format with mac_ prefix
Transformed columns: ['year', 'mac_cnb_repo_rate_annual', 'mac_hicp_dec', 'mac_hicp_overall_roc', 'mac_hicp_pure_energy_roc', 'mac_hicp_energy_full_roc', 'mac_nom_gr_avg_wage_czk', 'mac_no_of_employees_ths', 'mac_gdp_nominal_prices', 'mac_gdp_2020_base_prices', 'mac_gdp_2020_base_prices_sopr', 'mac_deflator_nominal', 'mac_deflator_bas

  macro_wide = macro_indicators_df.collect().pivot(


In [12]:
# Add firm_ prefix to all columns in main_df (except year which is needed for joining)
print("=== Adding firm_ prefix to firm-level columns ===")

# Get the columns from main_df that need the firm_ prefix
main_cols_sample = main_df.limit(0).collect().columns
print(f"Original main columns count: {len(main_cols_sample)}")

# Create rename mapping for all columns except 'year' and join keys
firm_rename_map = {}
for col in main_cols_sample:
    if col not in ['year', 'level2_code']:  # Keep join keys unchanged
        firm_rename_map[col] = f"firm_{col}"

print(f"Renaming {len(firm_rename_map)} columns with firm_ prefix")

# Apply the renaming to main_df
main_df_renamed = main_df.rename(firm_rename_map)

print("Firm columns renamed successfully!")

# Step 1: Merge main_df with NACE data on level2_code and year
print("=== First Merge: Main + NACE data ===")

# Check if main_df has level2_code and year columns
main_cols_renamed = main_df_renamed.limit(0).collect().columns
print(f"Main df columns (sample): {main_cols_renamed[:10]}...")

# Check if we have the right join keys
if 'level2_code' not in main_cols_renamed:
    print("WARNING: level2_code not found in main_df columns")
    print("Available columns that might be the join key:")
    for col in main_cols_renamed:
        if 'level' in col.lower() or 'nace' in col.lower() or 'code' in col.lower():
            print(f"  - {col}")

# Perform the merge (left join to keep all main data)
merged_step1 = main_df_renamed.join(
    nace_renamed,
    left_on=["level2_code", "year"],
    right_on=["czso_code", "year"],
    how="left"
)

# Verify the merge
step1_sample = merged_step1.limit(3).collect()
print(f"After NACE merge - Shape: {step1_sample.shape}")
print(f"Columns count: {len(step1_sample.columns)}")
print(f"Sample of sector columns: {[col for col in step1_sample.columns if col.startswith('sector_')][:5]}")

# Check for any missing joins
null_check = merged_step1.select([
    pl.col("level2_code").is_null().sum().alias("null_level2_code"),
    pl.col("year").is_null().sum().alias("null_year"),
    pl.col("sector_avg_wages_by_nace").is_null().sum().alias("null_sector_wages") if "sector_avg_wages_by_nace" in merged_step1.collect_schema().names() else pl.lit(0).alias("null_sector_wages")
]).collect()
print(f"Null check after NACE merge: {null_check}")

print("First merge completed successfully!")

=== Adding firm_ prefix to firm-level columns ===
Original main columns count: 56
Renaming 54 columns with firm_ prefix
Firm columns renamed successfully!
=== First Merge: Main + NACE data ===
Main df columns (sample): ['firm_ico', 'year', 'firm_other_liabilities', 'firm_costs', 'firm_sales_revenue', 'firm_equity', 'firm_profit_net', 'firm_turnover', 'firm_current_assets', 'firm_oper_profit']...
After NACE merge - Shape: (3, 61)
Columns count: 61
Sample of sector columns: ['sector_avg_wages_by_nace', 'sector_no_of_employees_by_nace', 'sector_ppi_by_nace']
Null check after NACE merge: shape: (1, 3)
┌──────────────────┬───────────┬───────────────────┐
│ null_level2_code ┆ null_year ┆ null_sector_wages │
│ ---              ┆ ---       ┆ ---               │
│ u32              ┆ u32       ┆ u32               │
╞══════════════════╪═══════════╪═══════════════════╡
│ 1501             ┆ 0         ┆ 4647              │
└──────────────────┴───────────┴───────────────────┘
First merge completed su

In [13]:
# Step 2: Merge with macro data on year
print("=== Second Merge: Adding Macro data ===")

# Perform the second merge (left join to keep all existing data)
merged_final = merged_step1.join(
    macro_renamed,
    on="year",
    how="left"
)

# Collect the final result for verification
final_df = merged_final.collect()
print(f"Final merged data - Shape: {final_df.shape}")
print(f"Total columns: {len(final_df.columns)}")

# Categorize columns by prefix
firm_cols = [col for col in final_df.columns if not col.startswith(('sector_', 'mac_'))]
sector_cols = [col for col in final_df.columns if col.startswith('sector_')]
mac_cols = [col for col in final_df.columns if col.startswith('mac_')]

print(f"\nColumn breakdown:")
print(f"  - Firm columns: {len(firm_cols)}")
print(f"  - Sector columns: {len(sector_cols)}")
print(f"  - Macro columns: {len(mac_cols)}")

print(f"\nSample sector columns: {sector_cols[:5]}")
print(f"Sample macro columns: {mac_cols[:5]}")

# Final verification - check for missing data in key areas
print(f"\n=== Final Verification ===")
verification = final_df.select([
    pl.len().alias("total_rows"),
    pl.col("year").is_null().sum().alias("null_years"),
    pl.col("firm_ico").is_null().sum().alias("null_firm_ico") if "firm_ico" in final_df.columns else pl.lit(0).alias("null_firm_ico"),
    pl.col(sector_cols[0]).is_null().sum().alias(f"null_{sector_cols[0]}") if sector_cols else pl.lit(0).alias("null_sector_sample"),
    pl.col(mac_cols[0]).is_null().sum().alias(f"null_{mac_cols[0]}") if mac_cols else pl.lit(0).alias("null_mac_sample")
])

print(verification)

print(f"\nMerge process completed successfully!")
print(f"Final dataset contains {final_df.shape[0]:,} rows and {final_df.shape[1]} columns")

=== Second Merge: Adding Macro data ===
Final merged data - Shape: (665057, 98)
Total columns: 98

Column breakdown:
  - Firm columns: 58
  - Sector columns: 3
  - Macro columns: 37

Sample sector columns: ['sector_avg_wages_by_nace', 'sector_no_of_employees_by_nace', 'sector_ppi_by_nace']
Sample macro columns: ['mac_cnb_repo_rate_annual', 'mac_hicp_dec', 'mac_hicp_overall_roc', 'mac_hicp_pure_energy_roc', 'mac_hicp_energy_full_roc']

=== Final Verification ===
shape: (1, 5)
┌────────────┬────────────┬───────────────┬────────────────────────────┬───────────────────────────┐
│ total_rows ┆ null_years ┆ null_firm_ico ┆ null_sector_avg_wages_by_n ┆ null_mac_cnb_repo_rate_an │
│ ---        ┆ ---        ┆ ---           ┆ ace                        ┆ nual                      │
│ u32        ┆ u32        ┆ u32           ┆ ---                        ┆ ---                       │
│            ┆            ┆               ┆ u32                        ┆ u32                       │
╞════════════╪═

In [14]:
# Examine sample rows to verify data alignment
print("=== Sample Data Verification ===")

# Show a few sample rows with key columns
key_columns = ["firm_ico", "year", "level2_code"] if "firm_ico" in final_df.columns else ["year", "level2_code"]
sample_cols = key_columns + sector_cols[:3] + mac_cols[:3]
available_sample_cols = [col for col in sample_cols if col in final_df.columns]

print(f"Examining columns: {available_sample_cols}")
sample_data = final_df.select(available_sample_cols).head(5)
print(sample_data)

# Check data coverage by year
year_coverage = final_df.select([
    pl.col("year"),
    pl.len().alias("count")
]).group_by("year").sum().sort("year")

print(f"\nData coverage by year:")
print(year_coverage)

# Save the final merged dataset

final_df.write_parquet(output_path)
print(f"\nFinal merged dataset saved to: {output_path}")

print(f"\n🎉 Merge process completed successfully!")
print(f"   📊 {final_df.shape[0]:,} rows × {final_df.shape[1]} columns")
print(f"   🏢 {len(firm_cols)} firm-level variables")  
print(f"   🏭 {len(sector_cols)} sector-level variables")
print(f"   📈 {len(mac_cols)} macro-level variables")

=== Sample Data Verification ===
Examining columns: ['firm_ico', 'year', 'level2_code', 'sector_avg_wages_by_nace', 'sector_no_of_employees_by_nace', 'sector_ppi_by_nace', 'mac_cnb_repo_rate_annual', 'mac_hicp_dec', 'mac_hicp_overall_roc']
shape: (5, 9)
┌──────────┬──────┬────────────┬────────────┬───┬────────────┬────────────┬────────────┬───────────┐
│ firm_ico ┆ year ┆ level2_cod ┆ sector_avg ┆ … ┆ sector_ppi ┆ mac_cnb_re ┆ mac_hicp_d ┆ mac_hicp_ │
│ ---      ┆ ---  ┆ e          ┆ _wages_by_ ┆   ┆ _by_nace   ┆ po_rate_an ┆ ec         ┆ overall_r │
│ str      ┆ i16  ┆ ---        ┆ nace       ┆   ┆ ---        ┆ nual       ┆ ---        ┆ oc        │
│          ┆      ┆ str        ┆ ---        ┆   ┆ f64        ┆ ---        ┆ f64        ┆ ---       │
│          ┆      ┆            ┆ f64        ┆   ┆            ┆ f64        ┆            ┆ f64       │
╞══════════╪══════╪════════════╪════════════╪═══╪════════════╪════════════╪════════════╪═══════════╡
│ 00000795 ┆ 2000 ┆ 46         ┆ 12570.