In [56]:
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import os

# Load the data using lazy evaluation for better performance
main_path = os.path.join("..", "data", "source_cleaned", "magnusweb_panel_imputed.parquet")
#main_path = os.path.join("..", "data", "source_cleaned", "magnusweb_panel_hq.parquet")

# Output path for the final merged dataset
output_path = os.path.join("..", "data", "data_ready", "merged_panel_imputed.parquet")
#output_path = os.path.join("..", "data", "data_ready", "merged_panel_hq.parquet")

# Using scan_parquet for lazy loading
main_df = pl.scan_parquet(main_path)
#hq_df = pl.scan_parquet(hq_path)

nace_propagated_path = os.path.join("..", "data", "source_cleaned", "data_by_nace_annual_tidy_propagated.parquet")
nace_propagated_df = pl.scan_parquet(nace_propagated_path)

macro_indicators_path = os.path.join("..", "data", "source_cleaned", "economy_annual_tidy.parquet")
macro_indicators_df = pl.scan_parquet(macro_indicators_path)

print("Data loaded using lazy evaluation")    


Data loaded using lazy evaluation


In [57]:
# Explore the structure of each dataset
print("=== Main DataFrame Structure ===")
main_sample = main_df.limit(0).collect()  # Get structure without data
print(f"Main columns: {main_sample.columns}")

print("\n=== NACE Propagated DataFrame Structure ===")
nace_sample = nace_propagated_df.limit(5).collect()
print(f"NACE columns: {nace_sample.columns}")
print(f"NACE sample shape: {nace_sample.shape}")
print(nace_sample)

print("\n=== Macro Indicators DataFrame Structure ===")
macro_sample = macro_indicators_df.limit(5).collect()
print(f"Macro columns: {macro_sample.columns}")
print(f"Macro sample shape: {macro_sample.shape}")
print(macro_sample)

=== Main DataFrame Structure ===
Main columns: ['ico', 'year', 'other_liabilities', 'costs', 'sales_revenue', 'equity', 'profit_net', 'turnover', 'current_assets', 'oper_profit', 'total_liabilities', 'total_assets', 'total_liabilities_and_equity', 'profit_pre_tax', 'other_assets', 'fixed_assets', 'name', 'main_nace', 'main_nace_code', 'sub_nace_cz', 'sub_nace_cz_code', 'main_okec', 'main_okec_code', 'sub_okec', 'sub_okec_code', 'esa2010', 'esa95', 'locality', 'region', 'num_employees', 'num_employees_cat', 'turnover_cat', 'audit', 'consolidation', 'currency', 'date_founded', 'date_dissolved', 'status', 'legal_form', 'entity_type', 'year_founded', 'year_dissolved', 'is_dissolved', 'operating_margin_cal', 'net_margin_cal', 'roa_ebit_cal', 'roe_cal', 'equity_ratio_cal', 'cost_ratio_cal', 'asset_turnover_cal', 'labor_productivity_cal', 'effective_tax_rate_cal', 'level1_code', 'level2_code', 'name_czso_en', 'industry_flag']

=== NACE Propagated DataFrame Structure ===
NACE columns: ['czso_c

In [58]:
# Load NACE matching table for proper level1_code and level2_code mapping
print("=== Loading NACE Matching Table ===")
nace_matching_path = os.path.join("..", "data", "source_cleaned", "t_nace_matching.parquet")
nace_matching_df = pl.scan_parquet(nace_matching_path)

# Transform NACE data from long to wide format and add level-specific prefixes
print("=== Transforming NACE data ===")

# First, let's see what metrics we have in the NACE data
nace_metrics = nace_propagated_df.select("metric").unique().collect()
print(f"Available NACE metrics: {nace_metrics['metric'].to_list()}")

# Transform NACE Level 1 data from long to wide format
print("\n--- Processing Level 1 NACE data ---")
# Filter for level 1 and collect, then pivot with metrics as columns
nace_level1_long = nace_propagated_df.filter(pl.col("level") == 1).collect()

# Verify uniqueness of czso_code + year combinations for level 1
level1_unique_check = nace_level1_long.group_by(["czso_code", "year"]).len()
max_records_per_combo = level1_unique_check.select(pl.col("len").max()).item()
expected_metrics = len(nace_metrics['metric'].to_list())
print(f"Level 1: Max records per (czso_code, year): {max_records_per_combo}, Expected metrics: {expected_metrics}")

# Pivot to wide format with metrics as columns
# CRITICAL FIX: Remove name_en from pivot index to prevent duplicates
nace_level1 = nace_level1_long.pivot(
    index=["czso_code", "year"],  # Only unique identifiers
    on="metric",
    values="value"
)

# Get the name mapping separately (taking first name for each czso_code, year)
nace_level1_names = (nace_level1_long
                     .select(["czso_code", "year", "name_en"])
                     .unique(subset=["czso_code", "year"], keep="first"))

# Join names back to the pivoted data
nace_level1_with_names = nace_level1.join(
    nace_level1_names,
    on=["czso_code", "year"],
    how="left"
)

# Add sector_level1_ prefix to metric columns
metric_cols = nace_metrics['metric'].to_list()
nace_level1_renamed = nace_level1_with_names.rename({
    col: f"sector_level1_{col}" for col in metric_cols
})

# Rename name_en to proper suffix
nace_level1_renamed = nace_level1_renamed.rename({"name_en": "level1_nace_en_name"})

print(f"Level 1 NACE data transformed: {nace_level1_renamed.columns}")
print(f"Level 1 shape: {nace_level1_renamed.shape}")

# Transform NACE Level 2 data from long to wide format
print("\n--- Processing Level 2 NACE data ---")
nace_level2_long = nace_propagated_df.filter(pl.col("level") == 2).collect()

# Verify uniqueness of czso_code + year combinations for level 2  
level2_unique_check = nace_level2_long.group_by(["czso_code", "year"]).len()
max_records_per_combo = level2_unique_check.select(pl.col("len").max()).item()
print(f"Level 2: Max records per (czso_code, year): {max_records_per_combo}, Expected metrics: {expected_metrics}")

# Pivot to wide format with metrics as columns
# CRITICAL FIX: Remove name_en from pivot index to prevent duplicates
nace_level2 = nace_level2_long.pivot(
    index=["czso_code", "year"],  # Only unique identifiers
    on="metric",
    values="value"
)

# Get the name mapping separately (taking first name for each czso_code, year)
nace_level2_names = (nace_level2_long
                     .select(["czso_code", "year", "name_en"])
                     .unique(subset=["czso_code", "year"], keep="first"))

# Join names back to the pivoted data
nace_level2_with_names = nace_level2.join(
    nace_level2_names,
    on=["czso_code", "year"],
    how="left"
)

# Add sector_level2_ prefix to metric columns
nace_level2_renamed = nace_level2_with_names.rename({
    col: f"sector_level2_{col}" for col in metric_cols
})

# Rename name_en to proper suffix
nace_level2_renamed = nace_level2_renamed.rename({"name_en": "level2_nace_en_name"})

print(f"Level 2 NACE data transformed: {nace_level2_renamed.columns}")
print(f"Level 2 shape: {nace_level2_renamed.shape}")

# Convert back to lazy frames for efficient joining
nace_level1_renamed = pl.LazyFrame(nace_level1_renamed)
nace_level2_renamed = pl.LazyFrame(nace_level2_renamed)

print("\nNACE data transformed to wide format with level-specific prefixes")
print("CRITICAL FIX: Removed name_en from pivot index to ensure unique (czso_code, year) combinations")

=== Loading NACE Matching Table ===
=== Transforming NACE data ===
Available NACE metrics: ['ppi_by_nace', 'avg_wages_by_nace', 'no_of_employees_by_nace']

--- Processing Level 1 NACE data ---
Level 1: Max records per (czso_code, year): 3, Expected metrics: 3
Level 1 NACE data transformed: ['czso_code', 'year', 'sector_level1_avg_wages_by_nace', 'sector_level1_no_of_employees_by_nace', 'sector_level1_ppi_by_nace', 'level1_nace_en_name']
Level 1 shape: (475, 6)

--- Processing Level 2 NACE data ---
Level 2: Max records per (czso_code, year): 3, Expected metrics: 3
Level 2 NACE data transformed: ['czso_code', 'year', 'sector_level2_ppi_by_nace', 'sector_level2_avg_wages_by_nace', 'sector_level2_no_of_employees_by_nace', 'level2_nace_en_name']
Level 2 shape: (2125, 6)

NACE data transformed to wide format with level-specific prefixes
CRITICAL FIX: Removed name_en from pivot index to ensure unique (czso_code, year) combinations


In [59]:
# CRITICAL DIAGNOSTIC: Verify NACE data is properly pivoted before joins
print("=== VERIFYING NACE DATA FOR JOINS ===")

# Check level 1 data structure
if 'nace_level1_renamed' in locals():
    level1_sample = nace_level1_renamed.collect()
    print(f"Level 1 NACE data shape: {level1_sample.shape}")
    print(f"Columns: {level1_sample.columns}")
    
    # Check for specific case that was causing problems
    test_case = level1_sample.filter(pl.col("czso_code") == "G").filter(pl.col("year") == 2020)
    print(f"Test case (czso_code=G, year=2020): {test_case.shape[0]} rows")
    if test_case.shape[0] > 0:
        print("Sample:")
        print(test_case)
    
    # Check uniqueness
    unique_combos = level1_sample.select(["czso_code", "year"]).n_unique()
    total_rows = level1_sample.height
    print(f"Level 1 unique (czso_code, year) combinations: {unique_combos}")
    print(f"Level 1 total rows: {total_rows}")
    if unique_combos == total_rows:
        print("✅ Level 1 NACE data is properly unique - good for joining!")
    else:
        print("❌ Level 1 NACE data still has duplicates!")
        
else:
    print("❌ nace_level1_renamed not available")

print()

# Check level 2 data structure  
if 'nace_level2_renamed' in locals():
    level2_sample = nace_level2_renamed.collect()
    print(f"Level 2 NACE data shape: {level2_sample.shape}")
    print(f"Columns: {level2_sample.columns}")
    
    # Check uniqueness
    unique_combos = level2_sample.select(["czso_code", "year"]).n_unique()
    total_rows = level2_sample.height
    print(f"Level 2 unique (czso_code, year) combinations: {unique_combos}")
    print(f"Level 2 total rows: {total_rows}")
    if unique_combos == total_rows:
        print("✅ Level 2 NACE data is properly unique - good for joining!")
    else:
        print("❌ Level 2 NACE data still has duplicates!")
else:
    print("❌ nace_level2_renamed not available")

=== VERIFYING NACE DATA FOR JOINS ===
Level 1 NACE data shape: (475, 6)
Columns: ['czso_code', 'year', 'sector_level1_avg_wages_by_nace', 'sector_level1_no_of_employees_by_nace', 'sector_level1_ppi_by_nace', 'level1_nace_en_name']
Test case (czso_code=G, year=2020): 1 rows
Sample:
shape: (1, 6)
┌───────────┬──────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┐
│ czso_code ┆ year ┆ sector_level1_avg ┆ sector_level1_no_ ┆ sector_level1_ppi ┆ level1_nace_en_na │
│ ---       ┆ ---  ┆ _wages_by_nac…    ┆ of_employees_…    ┆ _by_nace          ┆ me                │
│ str       ┆ i64  ┆ ---               ┆ ---               ┆ ---               ┆ ---               │
│           ┆      ┆ f64               ┆ f64               ┆ f64               ┆ str               │
╞═══════════╪══════╪═══════════════════╪═══════════════════╪═══════════════════╪═══════════════════╡
│ G         ┆ 2020 ┆ 33482.0           ┆ 496.8             ┆ null              ┆ Wholesale and    

In [60]:
# Transform macro data from long to wide format and add mac_ prefix
print("=== Transforming Macro data ===")

# First, let's see what metrics we have in the macro data
macro_metrics = macro_indicators_df.select("metric").unique().collect()
print(f"Available macro metrics: {macro_metrics['metric'].to_list()}")

# Transform macro data from long to wide format
# For lazy frames, we need to collect first
macro_wide = macro_indicators_df.collect().pivot(
    index=["year"],
    on="metric",  # Updated from 'columns' to 'on'
    values="value"
)

# Add mac_ prefix to metric columns (all columns except year)
macro_renamed = macro_wide.rename({
    col: f"mac_{col}" for col in macro_wide.columns 
    if col != "year"
})

# Convert back to lazy frame for efficient joining
macro_renamed = pl.LazyFrame(macro_renamed)

print("Macro data transformed to wide format with mac_ prefix")
print(f"Transformed columns: {macro_renamed.collect_schema().names()}")

=== Transforming Macro data ===
Available macro metrics: ['unemp_rate', 'gdp_nominal_prices', 'CPI_YTYPCT', 'deflator_nominal', 'no_of_employees_ths', 'NLGXQ', 'PDTY', 'ITV_ANNPCT', 'HRS', 'IRL', 'EXCHEB', 'NOOQ', 'hicp_dec', 'UNR', 'PCORE_YTYPCT', 'EXCH', 'fx_czk_eur_annual_avg', 'gdp_2020_base_prices', 'ULCDR', 'hicp_overall_roc', 'import_price_index_ex_energy', 'ULC', 'hicp_pure_energy_roc', 'deflator_base_2020', 'gdp_2020_base_prices_sopr', 'MPEN', 'CPV_ANNPCT', 'FBGSQ', 'cnb_repo_rate_annual', 'RPMGS', 'TTRADE', 'IRS', 'nom_gr_avg_wage_czk', 'hicp_energy_full_roc', 'GGFLMQ', 'KTPV_ANNPCT', 'GAP']
Macro data transformed to wide format with mac_ prefix
Transformed columns: ['year', 'mac_cnb_repo_rate_annual', 'mac_hicp_dec', 'mac_hicp_overall_roc', 'mac_hicp_pure_energy_roc', 'mac_hicp_energy_full_roc', 'mac_nom_gr_avg_wage_czk', 'mac_no_of_employees_ths', 'mac_gdp_nominal_prices', 'mac_gdp_2020_base_prices', 'mac_gdp_2020_base_prices_sopr', 'mac_deflator_nominal', 'mac_deflator_bas

In [61]:
# Add firm_ prefix to all columns in main_df (except year and join keys)
print("=== Adding firm_ prefix to firm-level columns ===")

# Get the columns from main_df that need the firm_ prefix
main_cols_sample = main_df.limit(0).collect().columns
print(f"Original main columns count: {len(main_cols_sample)}")

# Create rename mapping for all columns except 'year' and join keys
firm_rename_map = {}
for col in main_cols_sample:
    if col not in ['year', 'level1_code', 'level2_code']:  # Keep join keys unchanged
        firm_rename_map[col] = f"firm_{col}"

print(f"Renaming {len(firm_rename_map)} columns with firm_ prefix")

# Apply the renaming to main_df
main_df_renamed = main_df.rename(firm_rename_map)

print("Firm columns renamed successfully!")

# Step 1: Merge main_df with Level 1 NACE data
print("=== First Merge: Main + Level 1 NACE data ===")

# The join strategy:
# - main_df has level1_code (like 'A', 'B', 'C', etc.)  
# - nace_level1_renamed has czso_code (also like 'A', 'B', 'C', etc. for level 1)
# - So we can join directly on level1_code = czso_code

# Check if we have the right join keys
main_cols_renamed = main_df_renamed.limit(0).collect().columns
print(f"Main df has level1_code: {'level1_code' in main_cols_renamed}")
print(f"Main df has level2_code: {'level2_code' in main_cols_renamed}")

# Perform the first merge with Level 1 NACE data (left join to keep all main data)
merged_step1 = main_df_renamed.join(
    nace_level1_renamed,
    left_on=["level1_code", "year"],
    right_on=["czso_code", "year"], 
    how="left"
)

# Verify the first merge
step1_sample = merged_step1.limit(3).collect()
print(f"After Level 1 NACE merge - Shape: {step1_sample.shape}")
level1_sector_cols = [col for col in step1_sample.columns if col.startswith('sector_level1_')]
print(f"Level 1 sector columns: {level1_sector_cols}")

# Step 2: Merge with Level 2 NACE data  
print("=== Second Merge: Adding Level 2 NACE data ===")

# For level 2, we need to match level2_code (like '01', '02', etc.) with czso_code
# Perform the second merge with Level 2 NACE data (left join to keep all existing data)
merged_step2 = merged_step1.join(
    nace_level2_renamed,
    left_on=["level2_code", "year"],
    right_on=["czso_code", "year"],
    how="left"
)

# Verify the second merge
step2_sample = merged_step2.limit(3).collect()
print(f"After Level 2 NACE merge - Shape: {step2_sample.shape}")
level2_sector_cols = [col for col in step2_sample.columns if col.startswith('sector_level2_')]
print(f"Level 2 sector columns: {level2_sector_cols}")

# Check for any missing joins - count nulls in key sector variables
level1_check_col = level1_sector_cols[0] if level1_sector_cols else None
level2_check_col = level2_sector_cols[0] if level2_sector_cols else None

null_check_cols = ["year"]
if level1_check_col:
    null_check_cols.append(level1_check_col)
if level2_check_col:
    null_check_cols.append(level2_check_col)

null_check = merged_step2.select([
    pl.col(col).is_null().sum().alias(f"null_{col}") for col in null_check_cols
]).collect()
print(f"Null check after NACE merges:")
for col in null_check.columns:
    print(f"  {col}: {null_check[col].item()}")

print("NACE merges completed successfully!")

=== Adding firm_ prefix to firm-level columns ===
Original main columns count: 56
Renaming 53 columns with firm_ prefix
Firm columns renamed successfully!
=== First Merge: Main + Level 1 NACE data ===
Main df has level1_code: True
Main df has level2_code: True
After Level 1 NACE merge - Shape: (3, 60)
Level 1 sector columns: ['sector_level1_avg_wages_by_nace', 'sector_level1_no_of_employees_by_nace', 'sector_level1_ppi_by_nace']
=== Second Merge: Adding Level 2 NACE data ===
After Level 2 NACE merge - Shape: (3, 64)
Level 2 sector columns: ['sector_level2_ppi_by_nace', 'sector_level2_avg_wages_by_nace', 'sector_level2_no_of_employees_by_nace']
Null check after NACE merges:
  null_year: 0
  null_sector_level1_avg_wages_by_nace: 2004
  null_sector_level2_ppi_by_nace: 40443
NACE merges completed successfully!


In [62]:
# Step 3: Merge with macro data on year
print("=== Third Merge: Adding Macro data ===")

# Check shapes before merge
print(f"Before macro merge - merged_step2 shape: {merged_step2.collect().shape}")

# Check macro data shape
macro_sample = macro_renamed.collect()
print(f"Macro data shape: {macro_sample.shape}")
print(f"Macro years: {macro_sample['year'].min()} to {macro_sample['year'].max()}")

# Check for duplicate years in macro data
macro_year_counts = macro_sample.select(pl.col("year").value_counts()).unnest("year")
duplicate_years = macro_year_counts.filter(pl.col("count") > 1)
if duplicate_years.height > 0:
    print(f"❌ PROBLEM: Duplicate years in macro data!")
    print(duplicate_years)
else:
    print(f"✓ Macro data has unique years")

# Perform the third merge (left join to keep all existing data)
merged_final = merged_step2.join(
    macro_renamed,
    on="year",
    how="left"
)

# Collect the final result for verification
final_df = merged_final.collect()
print(f"After macro merge - Final merged data Shape: {final_df.shape}")
print(f"Total columns: {len(final_df.columns)}")

# Check for unexpected row expansion
original_main_count = 663437  # Known from previous analysis
if final_df.shape[0] != original_main_count:
    print(f"❌ ROW EXPANSION DETECTED!")
    print(f"   Expected: {original_main_count:,} rows")
    print(f"   Actual: {final_df.shape[0]:,} rows")
    print(f"   Expansion factor: {final_df.shape[0] / original_main_count:.2f}x")
    
    # Check for duplicate firm-year combinations that might explain this
    if "firm_ico" in final_df.columns:
        ico_year_counts = final_df.group_by(["firm_ico", "year"]).len()
        max_dupes = ico_year_counts.select(pl.col("len").max()).item()
        print(f"   Max firm-year duplicates: {max_dupes}")
        if max_dupes > 1:
            print("   Sample duplicated firm-year pairs:")
            dupes = ico_year_counts.filter(pl.col("len") > 1).head(5)
            print(dupes)
else:
    print(f"✓ No unexpected row expansion - maintaining {original_main_count:,} rows")

# Categorize columns by prefix
firm_cols = [col for col in final_df.columns if not col.startswith(('sector_', 'mac_'))]
sector_level1_cols = [col for col in final_df.columns if col.startswith('sector_level1_')]
sector_level2_cols = [col for col in final_df.columns if col.startswith('sector_level2_')]
mac_cols = [col for col in final_df.columns if col.startswith('mac_')]

print(f"\nColumn breakdown:")
print(f"  - Firm columns: {len(firm_cols)}")
print(f"  - Sector Level 1 columns: {len(sector_level1_cols)}")
print(f"  - Sector Level 2 columns: {len(sector_level2_cols)}")
print(f"  - Macro columns: {len(mac_cols)}")

print(f"\nSample sector level 1 columns: {sector_level1_cols[:5]}")
print(f"Sample sector level 2 columns: {sector_level2_cols[:5]}")
print(f"Sample macro columns: {mac_cols[:5]}")

# Final verification - check for missing data in key areas
print(f"\n=== Final Verification ===")
verification = final_df.select([
    pl.len().alias("total_rows"),
    pl.col("year").is_null().sum().alias("null_years"),
    pl.col("firm_ico").is_null().sum().alias("null_firm_ico") if "firm_ico" in final_df.columns else pl.lit(0).alias("null_firm_ico"),
    pl.col(sector_level1_cols[0]).is_null().sum().alias(f"null_{sector_level1_cols[0][:20]}") if sector_level1_cols else pl.lit(0).alias("null_sector_level1_sample"),
    pl.col(sector_level2_cols[0]).is_null().sum().alias(f"null_{sector_level2_cols[0][:20]}") if sector_level2_cols else pl.lit(0).alias("null_sector_level2_sample"),
    pl.col(mac_cols[0]).is_null().sum().alias(f"null_{mac_cols[0]}") if mac_cols else pl.lit(0).alias("null_mac_sample")
])

print(verification)

# Check for duplicate firm-year observations
if "firm_ico" in final_df.columns:
    duplicate_rows = final_df.group_by(["firm_ico", "year"]).len().filter(pl.col("len") > 1)
    num_duplicates = duplicate_rows.height
    print(f"\nDuplicate firm-year check: Found {num_duplicates} duplicate firm-year observations.")
    if num_duplicates > 0:
        print("Sample of duplicate firm-year pairs:")
        print(duplicate_rows.head())

print(f"\nMerge process completed!")
print(f"Final dataset contains {final_df.shape[0]:,} rows and {final_df.shape[1]} columns")

=== Third Merge: Adding Macro data ===
Before macro merge - merged_step2 shape: (663296, 64)
Macro data shape: (26, 38)
Macro years: 1999 to 2024
✓ Macro data has unique years
After macro merge - Final merged data Shape: (663296, 101)
Total columns: 101
❌ ROW EXPANSION DETECTED!
   Expected: 663,437 rows
   Actual: 663,296 rows
   Expansion factor: 1.00x
   Max firm-year duplicates: 1

Column breakdown:
  - Firm columns: 58
  - Sector Level 1 columns: 3
  - Sector Level 2 columns: 3
  - Macro columns: 37

Sample sector level 1 columns: ['sector_level1_avg_wages_by_nace', 'sector_level1_no_of_employees_by_nace', 'sector_level1_ppi_by_nace']
Sample sector level 2 columns: ['sector_level2_ppi_by_nace', 'sector_level2_avg_wages_by_nace', 'sector_level2_no_of_employees_by_nace']
Sample macro columns: ['mac_cnb_repo_rate_annual', 'mac_hicp_dec', 'mac_hicp_overall_roc', 'mac_hicp_pure_energy_roc', 'mac_hicp_energy_full_roc']

=== Final Verification ===
shape: (1, 6)
┌────────────┬────────────

In [63]:
# MINIMAL JOIN TEST - Find the exact source of duplicates
print("=== MINIMAL JOIN TEST ===")

# Test just the first join - main_df + level 1 NACE
print("Testing Level 1 join only...")

# Take a small sample for testing
test_main = main_df_renamed.filter(pl.col("level1_code") == "G").filter(pl.col("year") == 2020).collect()
print(f"Test main data: {test_main.shape[0]} records for level1_code=G, year=2020")

# Check the NACE level 1 data we're joining with
test_nace_l1 = nace_level1_renamed.filter(pl.col("czso_code") == "G").filter(pl.col("year") == 2020).collect()
print(f"Test NACE Level 1 data: {test_nace_l1.shape[0]} records for czso_code=G, year=2020")

if test_nace_l1.shape[0] > 0:
    print("NACE Level 1 sample:")
    print(test_nace_l1.select(["czso_code", "year", "level1_nace_en_name", "sector_level1_avg_wages_by_nace"]))

# Do the join
test_join = pl.LazyFrame(test_main).join(
    nace_level1_renamed,
    left_on=["level1_code", "year"],
    right_on=["czso_code", "year"], 
    how="left"
).collect()

print(f"After join: {test_join.shape[0]} records")
print(f"Expected: {test_main.shape[0]} records (no duplication)")

if test_join.shape[0] != test_main.shape[0]:
    print(f"❌ DUPLICATION FOUND: {test_join.shape[0] / test_main.shape[0]:.1f}x expansion")
    
    # Check the join keys more carefully
    print("\\nAnalyzing duplication...")
    
    # Count unique combinations in both datasets
    main_combos = test_main[["level1_code", "year"]].n_unique()
    nace_combos = test_nace_l1[["czso_code", "year"]].n_unique() if test_nace_l1.shape[0] > 0 else 0
    
    print(f"Main unique (level1_code, year): {main_combos}")
    print(f"NACE unique (czso_code, year): {nace_combos}")
    
    # Show duplicate analysis
    if test_join.shape[0] > 0:
        firm_year_dupes = test_join.group_by(["firm_ico", "year"]).len().sort("len", descending=True)
        print("\\nTop firm-year duplicates:")
        print(firm_year_dupes.head())
        
        # Show the actual duplicated rows for investigation
        if firm_year_dupes.height > 0 and firm_year_dupes[0, "len"] > 1:
            dup_ico = firm_year_dupes[0, "firm_ico"]
            dup_year = firm_year_dupes[0, "year"]
            dup_rows = test_join.filter(pl.col("firm_ico") == dup_ico).filter(pl.col("year") == dup_year)
            print(f"\\nDuplicate rows for ico={dup_ico}, year={dup_year}:")
            print(dup_rows.select(["firm_ico", "year", "level1_code", "level1_nace_en_name"]))
else:
    print("✅ No duplication in Level 1 join")

=== MINIMAL JOIN TEST ===
Testing Level 1 join only...
Test main data: 6762 records for level1_code=G, year=2020
Test NACE Level 1 data: 1 records for czso_code=G, year=2020
NACE Level 1 sample:
shape: (1, 4)
┌───────────┬──────┬─────────────────────────────────┬─────────────────────────────────┐
│ czso_code ┆ year ┆ level1_nace_en_name             ┆ sector_level1_avg_wages_by_nac… │
│ ---       ┆ ---  ┆ ---                             ┆ ---                             │
│ str       ┆ i64  ┆ str                             ┆ f64                             │
╞═══════════╪══════╪═════════════════════════════════╪═════════════════════════════════╡
│ G         ┆ 2020 ┆ Wholesale and retail trade; re… ┆ 33482.0                         │
└───────────┴──────┴─────────────────────────────────┴─────────────────────────────────┘
After join: 6762 records
Expected: 6762 records (no duplication)
✅ No duplication in Level 1 join


In [64]:
# save the final merged DataFrame to Parquet
print("=== Saving final merged DataFrame ===")
final_df.write_parquet(output_path, compression="snappy")
print(f"Final merged DataFrame saved to {output_path}")

=== Saving final merged DataFrame ===
Final merged DataFrame saved to ../data/data_ready/merged_panel_imputed.parquet
