In [3]:
import pandas as pd
import numpy as np

import sys
import os
import runpy

In [5]:
os.chdir(r"C:\Users\jarem\OneDrive - London School of Economics\YEAR 2\PP4V8 - policy paper\policy-paper-repo")

In [14]:
pop98_path = r"data/inputs/1. matching_vars/pop/1998/pop_1998_gmina.csv"
edu98_path = r"data/inputs/1. matching_vars/pop/1998/edu_1998_gmina.csv"

pop98 = pd.read_csv(pop98_path, delimiter=';', dtype={'Kod': str})
pop98.rename(columns={'Kod':'gmina_code','Nazwa':'gmina_name','ogółem;1998;[osoba]':'tot_pop_1998'}, inplace=True)
# create working_pop as in your notebook (adjust column names if different)
pop98['20-29;1998;[osoba]'] = pop98.get('20-29;1998;[osoba]', 0)
pop98['30-39;1998;[osoba]'] = pop98.get('30-39;1998;[osoba]', 0)
pop98['40-49;1998;[osoba]'] = pop98.get('40-49;1998;[osoba]', 0)
pop98['50-59;1998;[osoba]'] = pop98.get('50-59;1998;[osoba]', 0)
pop98['working_pop_1998'] = pop98[['20-29;1998;[osoba]','30-39;1998;[osoba]','40-49;1998;[osoba]','50-59;1998;[osoba]']].sum(axis=1)

# strip codes and drop aggregates / national rows
pop98['gmina_code'] = pop98['gmina_code'].str.strip()
bad_mask = pop98['gmina_code'].isin(['0', '', None]) | pop98['gmina_name'].str.contains('POLSKA', case=False, na=False)
pop98 = pop98.loc[~bad_mask].copy()
pop98 = pop98.drop_duplicates(subset='gmina_code', keep='first')

edu98 = pd.read_csv(edu98_path, delimiter=';', dtype={'Kod': str})
edu98.rename(columns={'Kod':'gmina_code','Nazwa':'gmina_name','wyższe;1998;[osoba]':'higher_1998',
                      'średnie;1998;[osoba]':'secondary_1998',
                      'zasadnicze zawodowe;1998;[osoba]':'vocational_1998',
                      'podstawowe;1998;[osoba]':'primary_1998'}, inplace=True)
edu98['gmina_code'] = edu98['gmina_code'].str.strip()
edu98 = edu98.loc[~edu98['gmina_code'].isin(['0','',None])].drop_duplicates('gmina_code', keep='first')

# merge 1998 gmina-level pop + edu
df_full = pop98.merge(edu98.drop(columns=['gmina_name']), on='gmina_code', how='left', validate='1:1')

# --- 2002 population + education (gmina) ---
pop02_path = r"data/inputs/1. matching_vars/pop/2002/pop_age_2002_gmina.csv"
edu02_path = r"data/inputs/1. matching_vars/pop/2002/edu_age_2002_powiat.csv"  # adjust if wrong file

pop02 = pd.read_csv(pop02_path, delimiter=';', dtype={'Kod': str})
pop02.rename(columns={'Kod':'gmina_code','Nazwa':'gmina_name','ogółem;ogółem;2002;[osoba]':'tot_pop_2002',
                      'ogółem;w wieku produkcyjnym ogółem;2002;[osoba]':'working_pop_2002'}, inplace=True)
pop02['gmina_code'] = pop02['gmina_code'].str.strip()
pop02 = pop02.loc[~pop02['gmina_code'].isin(['0','',None])].drop_duplicates('gmina_code', keep='first')

pop_edu_02 = pd.read_csv(edu02_path, delimiter=';', dtype={'Kod': str})
# rename columns from your notebook mapping
pop_edu_02.rename(columns={'Kod':'gmina_code','Nazwa':'gmina_name',
                           'ogółem;wyższe;2002;[osoba]':'higher_2002',
                           'ogółem;średnie razem;2002;[osoba]':'secondary_2002',
                           'ogółem;podstawowe ukończone;2002;[osoba]':'primary_2002',
                           'ogółem;policealne;2002;[osoba]':'post_secondary_2002',
                           'ogółem;średnie zawodowe;2002;[osoba]':'secondary_vocational_2002',
                           'ogółem;zasadnicze zawodowe;2002;[osoba]':'vocational_2002',
                           'ogółem;średnie ogólnokształcące;2002;[osoba]':'secondary_general_2002'}, inplace=True)
pop_edu_02['gmina_code'] = pop_edu_02['gmina_code'].str.strip()
pop_edu_02 = pop_edu_02.loc[~pop_edu_02['gmina_code'].isin(['0','',None])].drop_duplicates('gmina_code', keep='first')

# merge 2002 gmina-level pop + edu
df_full_02 = pop02.merge(pop_edu_02.drop(columns=['gmina_name']), on='gmina_code', how='left', validate='1:1')

# --- Harmonize columns and stack 1998 & 2002 ---
df_98 = df_full.rename(columns={
    'tot_pop_1998':'tot_pop','working_pop_1998':'working_pop',
    'higher_1998':'higher','secondary_1998':'secondary',
    'vocational_1998':'vocational','primary_1998':'primary'
}).copy()
df_98['year'] = 1998

df_02 = df_full_02.rename(columns={
    'tot_pop_2002':'tot_pop','working_pop_2002':'working_pop',
    'higher_2002':'higher','post_secondary_2002':'post_secondary',
    'secondary_2002':'secondary','secondary_general_2002':'secondary_general',
    'secondary_vocational_2002':'secondary_vocational','vocational_2002':'vocational',
    'primary_2002':'primary'
}).copy()
df_02['year'] = 2002

common_cols = [
    'gmina_code','gmina_name','year',
    'tot_pop','working_pop',
    'higher','post_secondary','secondary',
    'secondary_general','secondary_vocational',
    'vocational','primary'
]

for col in common_cols:
    if col not in df_98.columns:
        df_98[col] = pd.NA
    if col not in df_02.columns:
        df_02[col] = pd.NA

df_98 = df_98[common_cols]
df_02 = df_02[common_cols]

df_harmonized = pd.concat([df_98, df_02], ignore_index=True)

# diagnostics
print("Rows after harmonization:", df_harmonized.shape)
print("Unique gmina_codes (any year):", df_harmonized['gmina_code'].nunique())
# display(df_harmonized.head(10))

# --- Compute 1998 -> 2002 changes (wide join) ---
id_cols = ['gmina_code','gmina_name']
value_cols = [c for c in df_harmonized.columns if c not in id_cols + ['year']]

df98 = df_harmonized[df_harmonized['year'] == 1998].set_index(id_cols)[value_cols].copy()
df02 = df_harmonized[df_harmonized['year'] == 2002].set_index(id_cols)[value_cols].copy()

# coerce numeric for change calc
df98 = df98.apply(pd.to_numeric, errors='coerce')
df02 = df02.apply(pd.to_numeric, errors='coerce')

df_join = df98.add_suffix('_1998').join(df02.add_suffix('_2002'), how='outer').reset_index()

# compute abs and pct changes
for col in value_cols:
    c98 = f"{col}_1998"
    c02 = f"{col}_2002"
    abscol = f"{col}_abs_change"
    pctcol = f"{col}_pct_change"
    df_join[abscol] = df_join[c02] - df_join[c98]
    df_join[pctcol] = np.where(
        df_join[c98].replace(0, np.nan).notna(),
        df_join[abscol] / df_join[c98],
        np.nan
    )

# # optional: keep only rows present in both years (e.g. tot_pop non-null in both)
# both_mask = df_join[['tot_pop_1998','tot_pop_2002']].notna().all(axis=1)
# df_changes = df_join.loc[both_mask].copy()

df_harmonized.to_csv(r"data/clean/matching/pop_harmonized_gmina_1998_2002.csv", index=False)
# df_changes.to_csv(r"data/clean/matching/gmina_changes_1998_2002.csv", index=False)

display(df_harmonized.head())

Rows after harmonization: (4405, 12)
Unique gmina_codes (any year): 4017


Unnamed: 0,gmina_code,gmina_name,year,tot_pop,working_pop,higher,post_secondary,secondary,secondary_general,secondary_vocational,vocational,primary
0,200000,DOLNOŚLĄSKIE,1998,2948212,1603071,146699,,577461,,,534634,814891
1,201000,Powiat bolesławiecki,1998,86359,47068,2649,,13847,,,16302,25419
2,201011,Bolesławiec (1),1998,43503,24764,2213,,9786,,,7924,11100
3,201022,Bolesławiec (2),1998,10340,5345,84,,882,,,2158,3363
4,201032,Gromadka (2),1998,5851,3066,72,,638,,,1062,1918


In [15]:
# df = pd.read_csv(r"data/clean/matching/gmina_changes_1998_2002.csv")