In [1]:
%load_ext IPython.extensions.autoreload
%autoreload 0

In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd

import openpyxl

import sys
import os
import runpy

import requests
from io import StringIO
import json

import json
import glob
from pathlib import Path

import datetime
from dateutil import parser
import pickle
import re

## CREATE LOOKUP MASTER TABLE 

In [3]:
# Locations of the source workbook and of the pickle this cell produces.
# NOTE(review): both are absolute, machine-specific paths, so this cell only
# runs on a machine with this exact folder layout.
INPUT_EXCEL = r"C:\Users\jarem\OneDrive - London School of Economics\YEAR 2\1. Policy paper\policy-paper-repo\data\inputs\shapefiles\polska\teryt_klucz_powiaty_gminy_lata_1999_2025-1.xlsx"
OUTPUT_PICKLE = r"C:\Users\jarem\OneDrive - London School of Economics\YEAR 2\1. Policy paper\policy-paper-repo\data\outputs\teryt_lookup\teryt_lookup_2025.pkl"

print("Reading Excel file... (This may take a moment)")

# Only these columns of the 'gminy' sheet are read; dtype=str asks pandas to
# read their values as strings.
cols = [
    'region',
    'nazwa_powiatu',
    'nazwa_gminy',
    'teryt_2025',
    'zmiana_opis',
]
df_g = pd.read_excel(
    INPUT_EXCEL,
    sheet_name='gminy',
    usecols=cols,
    dtype=str,
)

# --- 1. NORMALIZATION LOGIC ---
def normalize_series(s):
    """Return a normalised copy of a Series of place names for use as lookup keys.

    Steps, in order:
      1. convert every value to ``str`` and lower-case it;
      2. drop any "<whitespace>od<whitespace><4 digits>" fragment (e.g. " od 2002");
      3. delete each literal token below wherever it occurs, in the listed order.

    The token order matters: 'm.st.' has to be removed before the shorter
    'm.' and 'st.', and the bare '.' and ' ' are removed last.

    Parameters
    ----------
    s : pandas.Series
        Names to normalise. Non-string values are stringified by ``astype(str)``.

    Returns
    -------
    pandas.Series
        Normalised strings, aligned with the index of ``s``.
    """
    cleaned = s.astype(str).str.lower()
    cleaned = cleaned.str.replace(r'\s+od\s+\d{4}', '', regex=True)  # Remove ' od 2002'

    literal_tokens = ('m.st.', 'm.', 'st.', 'miasto', 'powiat', '-', '.', ' ')
    for token in literal_tokens:
        cleaned = cleaned.str.replace(token, '', regex=False)
    return cleaned

print("Normalizing names...")

# Voivodeship code: the text before the first '.', zero-padded to 2 characters.
region_text = df_g['region'].astype(str)
woj_id = region_text.str.split('.').str[0].str.zfill(2)

# Normalised powiat and gmina names (see normalize_series).
pow_norm = normalize_series(df_g['nazwa_powiatu'])
gmi_norm = normalize_series(df_g['nazwa_gminy'])

# Target code: the text before the first '.', zero-padded to 7 characters.
teryt_text = df_g['teryt_2025'].astype(str)
target_id = teryt_text.str.split('.').str[0].str.zfill(7)

# --- 2. BUILD PRIMARY MAP (Current Names) ---
# Key: (WojID, PowiatNorm, GminaNorm) -> Value: TERYT25
# NOTE(review): rows that normalise to the same key overwrite each other
# silently; the value from the last such row is the one that is kept.
primary_lookup = {
    name_key: teryt_code
    for name_key, teryt_code in zip(zip(woj_id, pow_norm, gmi_norm), target_id)
}

# --- 3. MINING HISTORY (Renames & Mergers) ---
# Adds extra keys to primary_lookup so that a name mentioned in `zmiana_opis`
# (a former name, or the name of an absorbed gmina) resolves to the
# `teryt_2025` code of the row that carries the description.
# `dict.update` is used, so a key that already exists is overwritten.
print("Mining historical changes...")

# A. Renames
mask_rename = df_g['zmiana_opis'].str.contains('zmiana nazwy z', na=False, case=False)
if mask_rename.any():
    old_names = df_g['zmiana_opis'].str.extract(r'zmiana nazwy z\s+(.+?)\s+na', flags=re.IGNORECASE)[0]
    # The extraction pattern is stricter than the mask (it also needs whitespace
    # after "z" and a following " na"). Rows that pass the mask but fail the
    # pattern yield NaN, which normalize_series would stringify to 'nan' and
    # register as a bogus key, so only rows with an extracted name are kept.
    has_old_name = mask_rename & old_names.notna()
    old_names_norm = normalize_series(old_names[has_old_name])

    # Map Old Name -> Current ID
    hist_keys = zip(woj_id[has_old_name], pow_norm[has_old_name], old_names_norm)
    primary_lookup.update(dict(zip(hist_keys, target_id[has_old_name])))

# B. Absorptions (Mergers)
mask_absorb = df_g['zmiana_opis'].str.contains('włączenie gminy', na=False, case=False)
if mask_absorb.any():
    # NOTE(review): `(.+?)(?:\s|$)` stops at the first whitespace, so only the
    # first word of the absorbed gmina's name is captured; a multi-word name
    # yields a truncated key. Confirm against the wording used in `zmiana_opis`.
    absorbed_names = df_g['zmiana_opis'].str.extract(r'włączenie gminy\s+(.+?)(?:\s|$)', flags=re.IGNORECASE)[0]
    # Same guard as for renames: skip rows where nothing could be extracted.
    has_absorbed_name = mask_absorb & absorbed_names.notna()
    absorbed_norm = normalize_series(absorbed_names[has_absorbed_name])

    # Map Absorbed Name -> Survivor ID
    absorb_keys = zip(woj_id[has_absorbed_name], pow_norm[has_absorbed_name], absorbed_norm)
    primary_lookup.update(dict(zip(absorb_keys, target_id[has_absorbed_name])))

# --- 4. BUILD FALLBACK MAP (Woj + Gmina only) ---
# Secondary table keyed by (woj_id, gmi_norm) only, i.e. without the powiat.
# Useful for cities like "M. Łódź" where powiat matching often fails
df_g['woj_id'] = woj_id
df_g['gmi_norm'] = gmi_norm
df_g['target_id'] = target_id

fallback_key_cols = ['woj_id', 'gmi_norm']

# Count distinct target IDs per (woj_id, gmi_norm) and keep only the pairs that
# point to exactly one ID (e.g. if 'Nowa Wieś' appears twice in a
# Voivodeship with different IDs, it is left out).
unique_counts = df_g.groupby(fallback_key_cols)['target_id'].nunique()
is_unambiguous = unique_counts == 1
valid_indices = unique_counts[is_unambiguous].index

# Re-index the frame by the pair and select the unambiguous pairs.
keyed_by_pair = df_g.set_index(fallback_key_cols)
fallback_df = keyed_by_pair.loc[valid_indices]
fallback_lookup = fallback_df['target_id'].to_dict()

# --- 5. SAVE TO DISK ---
# Both tables are written as one pickled tuple: (primary_lookup, fallback_lookup).
print(f"Saving lookup tables to {OUTPUT_PICKLE}...")

# Create the target folder first: open(..., 'wb') raises FileNotFoundError when
# the parent directory does not exist (e.g. on a fresh checkout).
Path(OUTPUT_PICKLE).parent.mkdir(parents=True, exist_ok=True)

with open(OUTPUT_PICKLE, 'wb') as f:
    pickle.dump((primary_lookup, fallback_lookup), f)

print("You can now load this file in your main script.")

Reading Excel file... (This may take a moment)
Normalizing names...
Mining historical changes...
Saving lookup tables to C:\Users\jarem\OneDrive - London School of Economics\YEAR 2\1. Policy paper\policy-paper-repo\data\outputs\teryt_lookup\teryt_lookup_2025.pkl...
You can now load this file in your main script.


## TERYT MATCHING FOR 2007 - 2013

#### Read in data and clean

**IMPORTANT:** rows with a missing (NA) `gmina` or `powiat` are dropped before TERYT matching.

In [4]:
from GUS_processing_functions import date_columns, data_types, assign_geo_ids

In [5]:
# Read the `umowy_200713_infered_distributed` CSV from the intermediary folder.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
umowy_200713_infered_distributed = pd.read_csv(
    r"C:\Users\jarem\OneDrive - London School of Economics\YEAR 2\1. Policy paper\policy-paper-repo\data\clean\treatment\eu_flows\intermediary\umowy_200713_infered_distributed.csv",
    low_memory=False,
)

In [6]:
# Convert every column named in `date_columns` to datetimes, in place.
# errors="coerce" turns values that cannot be parsed into NaT instead of raising.
for col in date_columns:
    raw_values = umowy_200713_infered_distributed[col]
    umowy_200713_infered_distributed[col] = pd.to_datetime(raw_values, errors="coerce")

In [7]:
# Show column names, non-null counts and dtypes of the loaded table.
umowy_200713_infered_distributed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406771 entries, 0 to 406770
Data columns (total 28 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   ID                             406771 non-null  object        
 1   project_title                  406771 non-null  object        
 2   program                        406771 non-null  object        
 3   priority_code                  406771 non-null  object        
 4   action_code                    406771 non-null  object        
 5   subaction_code                 184778 non-null  object        
 6   voviodeship                    405914 non-null  object        
 7   powiat                         386364 non-null  object        
 8   gmina                          335971 non-null  object        
 9   total_value_PLN                406771 non-null  float64       
 10  Wydatki kwalifikowalne         406771 non-null  float64       
 11  

In [8]:
# Drop, in place, every row where `gmina` or `powiat` is missing.
umowy_200713_infered_distributed.dropna(subset=["gmina", "powiat"], inplace=True)

### Assign geo IDs

In [9]:
# Location of the pickle that holds the (primary_lookup, fallback_lookup) tuple.
PICKLE_PATH = r"C:\Users\jarem\OneDrive - London School of Economics\YEAR 2\1. Policy paper\policy-paper-repo\data\outputs\teryt_lookup\teryt_lookup_2025.pkl"

# NOTE(review): pickle.load can execute arbitrary code, so only load a file
# that was produced locally and is trusted.
with open(PICKLE_PATH, 'rb') as pickle_file:
    loaded_tables = pickle.load(pickle_file)

# The file stores exactly two objects, in this order.
primary_lookup, fallback_lookup = loaded_tables

# Report the size of each table as a quick check that loading worked.
print(f"Loaded Primary Map with {len(primary_lookup)} entries.")
print(f"Loaded Fallback Map with {len(fallback_lookup)} entries.")

Loaded Primary Map with 2340 entries.
Loaded Fallback Map with 2153 entries.


In [10]:
# Run the project helper `assign_geo_ids` on the cleaned table. Judging by the
# saved .info() outputs, the result has 32 columns versus 28 in the input.
# NOTE(review): `primary_lookup` / `fallback_lookup` loaded in the previous cell
# are not passed in here; presumably the helper obtains the lookup tables on
# its own. Confirm in GUS_processing_functions.
umowy_200713_assigned = assign_geo_ids(umowy_200713_infered_distributed)

In [15]:
# Show column names, non-null counts and dtypes of the table returned by assign_geo_ids.
umowy_200713_assigned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 335108 entries, 13 to 406770
Data columns (total 32 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   ID                             335108 non-null  object        
 1   project_title                  335108 non-null  object        
 2   program                        335108 non-null  object        
 3   priority_code                  335108 non-null  object        
 4   action_code                    335108 non-null  object        
 5   subaction_code                 135343 non-null  object        
 6   voviodeship                    335108 non-null  object        
 7   powiat                         335108 non-null  object        
 8   gmina                          335108 non-null  object        
 9   total_value_PLN                335108 non-null  float64       
 10  Wydatki kwalifikowalne         335108 non-null  float64       
 11  subs

In [12]:
# Write the table with assigned IDs to the intermediary folder.
# index=False keeps the DataFrame's row index out of the CSV.
umowy_200713_assigned.to_csv(
    r"C:\Users\jarem\OneDrive - London School of Economics\YEAR 2\1. Policy paper\policy-paper-repo\data\clean\treatment\eu_flows\intermediary\umowy_200713_assigned.csv",
    index=False,
)

### Import shapefiles

In [None]:
# Shapefile locations for the two geometry layers.
powiaty_shp_path = r"C:\Users\jarem\OneDrive - London School of Economics\YEAR 2\1. Policy paper\policy-paper-repo\data\inputs\shapefiles\polska\powiaty\powiaty.shp"
gminy_shp_path = r"C:\Users\jarem\OneDrive - London School of Economics\YEAR 2\1. Policy paper\policy-paper-repo\data\inputs\shapefiles\polska\gminy\gminy.shp"

# Read each layer into its own GeoDataFrame.
powiaty_geoms = gpd.read_file(powiaty_shp_path)
gminy_geoms = gpd.read_file(gminy_shp_path)

In [None]:
# Display the gmina layer loaded in the previous cell.
# NOTE(review): the saved output of this cell is a NameError for `gminy`, which
# does not match the current code. Re-run the cell to refresh the output.
gminy_geoms

NameError: name 'gminy' is not defined