In [1]:
import pandas as pd 
import numpy as np
import geopandas as gpd
from shapely.geometry import Point

import sys
import os
import runpy

import requests
from io import StringIO
import json


In [2]:
os.chdir(r"C:\Users\jarem\OneDrive - London School of Economics\YEAR 2\PP4V8 - policy paper\policy-paper-repo")

# Preclean matching variables: pop

## Population 1998

### gmina

In [3]:
df = pd.read_csv(r'data/inputs/1. matching_vars/pop/1998/pop_1998_gmina.csv', delimiter=';')

In [4]:
df.rename(columns={
    'Kod' : 'gmina_code',
    'Nazwa' : 'gmina_name',
    'ogółem;1998;[osoba]' : 'tot_pop_1998',
    }, inplace=True)

In [5]:
df.shape

(4099, 8)

In [6]:
df['working_pop_1998'] = df['20-29;1998;[osoba]'] + df['30-39;1998;[osoba]'] + df['40-49;1998;[osoba]'] + df['50-59;1998;[osoba]']

In [7]:
df.drop(df.columns[3:8], axis=1, inplace=True)

In [474]:
# counts = df['gmina_code'].value_counts()
# unique_df = df[df['gmina_code'].map(counts) == 1]
# aggs_df = df[df['gmina_code'].map(counts) > 1]
# df = unique_df

In [8]:
display(df.head(3)) #, aggs_df.head(3))

Unnamed: 0,gmina_code,gmina_name,tot_pop_1998,working_pop_1998
0,0,POLSKA,37879105,20005136
1,0,POLSKA - gminy miejskie,19050104,10561917
2,0,POLSKA - gminy miejsko-wiejskie,8149273,4176804


In [9]:
# quick diagnostics
print(df.dtypes)
print(df['gmina_code'].value_counts().head(10))
display(df[df['gmina_code']==0].head())                  # shows aggregate rows if codes numeric 0
display(df[df['gmina_name'].str.contains('POLSKA', na=False)].head(10))

gmina_code           int64
gmina_name          object
tot_pop_1998         int64
working_pop_1998     int64
dtype: object
gmina_code
1200000    6
600000     6
0          6
200000     6
1000000    6
1400000    6
400000     6
800000     6
1600000    6
1800000    6
Name: count, dtype: int64


Unnamed: 0,gmina_code,gmina_name,tot_pop_1998,working_pop_1998
0,0,POLSKA,37879105,20005136
1,0,POLSKA - gminy miejskie,19050104,10561917
2,0,POLSKA - gminy miejsko-wiejskie,8149273,4176804
3,0,POLSKA - gminy wiejskie,10679728,5266415
4,0,POLSKA - miasta,23303773,12803061


Unnamed: 0,gmina_code,gmina_name,tot_pop_1998,working_pop_1998
0,0,POLSKA,37879105,20005136
1,0,POLSKA - gminy miejskie,19050104,10561917
2,0,POLSKA - gminy miejsko-wiejskie,8149273,4176804
3,0,POLSKA - gminy wiejskie,10679728,5266415
4,0,POLSKA - miasta,23303773,12803061
5,0,POLSKA - wieś,14575332,7202075


In [None]:
# read with Kod as string to preserve codes
df = pd.read_csv(r'data/inputs/1. matching_vars/pop/1998/pop_1998_gmina.csv',
                 delimiter=';', dtype={'Kod': str})
df.rename(columns={'Kod':'gmina_code','Nazwa':'gmina_name','ogółem;1998;[osoba]':'tot_pop_1998'}, inplace=True)
df['gmina_code'] = df['gmina_code'].str.strip()

mask_bad = df['gmina_code'].isin(['0', '', None]) | df['gmina_name'].str.contains('POLSKA', case=False, na=False)
df = df.loc[~mask_bad].copy()

dupes = df[df['gmina_code'].duplicated(keep=False)].sort_values('gmina_code')
display(dupes.head(5))
df = df.drop_duplicates(subset='gmina_code', keep='first')

## Education

In [11]:
edu = pd.read_csv(r'data/inputs/1. matching_vars/pop/1998/edu_1998_gmina.csv', delimiter=';', dtype={'Kod': str})

In [12]:
edu.rename(columns={
    'Kod' : 'gmina_code',
    'Nazwa' : 'gmina_name',
    'wyższe;1998;[osoba]' : 'higher_1998',
    'średnie;1998;[osoba]' : 'secondary_1998',
    'zasadnicze zawodowe;1998;[osoba]' : 'vocational_1998',
    'podstawowe;1998;[osoba]' : 'primary_1998'
    }, inplace=True)

edu['gmina_code']=edu['gmina_code'].str.strip()

In [13]:
edu = edu.loc[~edu['gmina_code'].isin(['0','',None])]

In [14]:
edu = edu.drop_duplicates(subset='gmina_code', keep='first')

In [15]:
df_full = df.merge(edu.drop(columns=['gmina_name']), on='gmina_code', how='left', validate='1:1')

In [None]:
# edu.drop(edu.columns[6:7], axis=1, inplace=True)

In [479]:
edu.head(3)

Unnamed: 0,gmina_code,gmina_name,higher_1998,secondary_1998,vocational_1998,primary_1998
0,0,POLSKA,1838360,6979678,6665843,10961500
1,0,POLSKA - gminy miejskie,1479007,4736406,3295480,4544384
2,0,POLSKA - gminy miejsko-wiejskie,213579,1211672,1501570,2565210


In [480]:
# counts = edu['gmina_code'].value_counts()
# unique_edu = edu[edu['gmina_code'].map(counts) == 1]
# aggs_edu = edu[edu['gmina_code'].map(counts) > 1]
# edu = unique_edu

## Concatinate

In [16]:
# edu.gmina_code.nunique(), df.gmina_code.nunique(), edu.gmina_code.shape

In [17]:
# display(edu.head(3), df.head(3))

In [18]:
# df_full = df.merge(edu.drop(columns=['gmina_name']), on= "gmina_code", copy=False)

In [19]:
df_full.head(3)

Unnamed: 0,gmina_code,gmina_name,tot_pop_1998,20-29;1998;[osoba],30-39;1998;[osoba],40-49;1998;[osoba],50-59;1998;[osoba],Unnamed: 7,higher_1998,secondary_1998,vocational_1998,primary_1998,Unnamed: 6
0,200000,DOLNOŚLĄSKIE,2948212,407798,555150,320144,319979,,146699,577461,534634,814891,
1,201000,Powiat bolesławiecki,86359,12501,16063,9067,9437,,2649,13847,16302,25419,
2,201011,Bolesławiec (1),43503,5597,8679,5364,5124,,2213,9786,7924,11100,


## 2002

In [486]:
pop_age_02 = pd.read_csv(r'data/inputs/1. matching_vars/pop/2002/pop_age_2002_gmina.csv', delimiter=';')

In [487]:
pop_age_02.rename(columns={
    'Kod' : 'gmina_code',
    'Nazwa' : 'gmina_name',
    'ogółem;ogółem;2002;[osoba]' : 'tot_pop_2002',
    'ogółem;w wieku produkcyjnym ogółem;2002;[osoba]' : 'working_pop_2002'
}, inplace=True)

In [488]:
pop_age_02.drop(pop_age_02.columns[4:5], axis=1, inplace=True)

In [20]:
# counts = pop_age_02['gmina_code'].value_counts()
# unique_pop = pop_age_02[pop_age_02['gmina_code'].map(counts) == 1]
# aggs_pop = pop_age_02[pop_age_02['gmina_code'].map(counts) > 1]
# pop_age_02 = unique_pop

In [490]:
display(pop_age_02.head(3), aggs_pop.head(3))

Unnamed: 0,gmina_code,gmina_name,tot_pop_2002,working_pop_2002
12,201000,Powiat bolesławiecki,88100,54675
13,202000,Powiat dzierżoniowski,106961,65784
14,203000,Powiat głogowski,88098,58608


Unnamed: 0,gmina_code,gmina_name,tot_pop_2002,working_pop_2002
0,0,POLSKA,38230080,23625719
1,0,POLSKA - gminy miejskie,19139633,12352874
2,0,POLSKA - gminy miejsko-wiejskie,8391587,5097198


### Education

In [491]:
pop_edu_02 = pd.read_csv(r'data/inputs/1. matching_vars/pop/2002/edu_age_2002_powiat.csv', delimiter=';')

In [492]:
pop_edu_02.drop(pop_edu_02.columns[9:], axis=1, inplace=True)

In [493]:
# pop_edu_02.columns.tolist()

In [494]:
pop_edu_02.rename(columns={
    'Kod' : 'gmina_code',
    'Nazwa' : 'gmina_name',
    'ogółem;wyższe;2002;[osoba]' : 'higher_2002',
    'ogółem;średnie razem;2002;[osoba]' : 'secondary_2002',
    'ogółem;podstawowe ukończone;2002;[osoba]' : 'primary_2002',
    'ogółem;podstawowe nieukończone i bez wykształcenia;2002;[osoba]' : 'no_edu_2002',
    'ogółem;policealne;2002;[osoba]' : 'post_secondary_2002',
    'ogółem;średnie zawodowe;2002;[osoba]' : 'secondary_vocational_2002',
    'ogółem;zasadnicze zawodowe;2002;[osoba]' : 'vocational_2002',
    'ogółem;średnie ogólnokształcące;2002;[osoba]' : 'secondary_general_2002'
}, inplace=True)

In [495]:
pop_edu_02['vocational_tot_2002'] = pop_edu_02['vocational_2002'] + pop_edu_02['secondary_vocational_2002']

In [496]:
pop_edu_02.head()

Unnamed: 0,gmina_code,gmina_name,higher_2002,post_secondary_2002,secondary_2002,secondary_general_2002,secondary_vocational_2002,vocational_2002,primary_2002,vocational_tot_2002
0,0,POLSKA,3203566,1023894,9184496,2802025,6382471,7539786,9651858,13922257
1,200000,DOLNOŚLĄSKIE,248420,85265,748071,224452,523619,583721,718362,1107340
2,201000,Powiat bolesławiecki,4869,2201,19311,4509,14802,20599,24341,35401
3,202000,Powiat dzierżoniowski,5811,2923,26347,7429,18918,20904,30778,39822
4,203000,Powiat głogowski,6398,2526,22448,6383,16065,20378,19805,36443


### Concatinate

In [497]:
df_full_02 = pop_age_02.merge(pop_edu_02.drop(columns=['gmina_name']), on= "gmina_code", copy=False)

In [498]:
df_full_02.columns

Index(['gmina_code', 'gmina_name', 'tot_pop_2002', 'working_pop_2002',
       'higher_2002', 'post_secondary_2002', 'secondary_2002',
       'secondary_general_2002', 'secondary_vocational_2002',
       'vocational_2002', 'primary_2002', 'vocational_tot_2002'],
      dtype='object')

In [499]:
df_full.columns

Index(['gmina_code', 'gmina_name', 'tot_pop_1998', 'working_pop_1998',
       'higher_1998', 'secondary_1998', 'vocational_1998', 'primary_1998'],
      dtype='object')

### Harmonize and merge 1998 and 2002

In [500]:
df_98 = df_full.rename(columns={
    'tot_pop_1998': 'tot_pop',
    'working_pop_1998': 'working_pop',
    'higher_1998': 'higher',
    'secondary_1998': 'secondary',
    'vocational_1998': 'vocational',
    'primary_1998': 'primary'
})

df_98['year'] = 1998


In [501]:
df_02 = df_full_02.rename(columns={
    'tot_pop_2002': 'tot_pop',
    'working_pop_2002': 'working_pop',
    'higher_2002': 'higher',
    'post_secondary_2002': 'post_secondary',
    'secondary_2002': 'secondary',
    'secondary_general_2002': 'secondary_general',
    'secondary_vocational_2002': 'secondary_vocational',
    'vocational_2002': 'vocational',
    'primary_2002': 'primary'
})

df_02['year'] = 2002


In [502]:
common_cols = [
    'gmina_code', 'gmina_name', 'year',
    'tot_pop', 'working_pop',
    'higher', 'post_secondary', 'secondary',
    'secondary_general', 'secondary_vocational',
    'vocational', 'primary'
]

for col in common_cols:
    if col not in df_98.columns:
        df_98[col] = pd.NA
    if col not in df_02.columns:
        df_02[col] = pd.NA

df_98 = df_98[common_cols]
df_02 = df_02[common_cols]


In [503]:
df_harmonized = pd.concat([df_98, df_02], ignore_index=True)


In [504]:
# df_harmonized.to_csv(r'data/clean/matching/pop_harmonized_gmina_1998_2002.csv', index=False)

In [505]:
df_harmonized.head()

Unnamed: 0,gmina_code,gmina_name,year,tot_pop,working_pop,higher,post_secondary,secondary,secondary_general,secondary_vocational,vocational,primary
0,0,POLSKA,1998,37879105,20005136,1838360,,6979678,,,6665843,10961500
1,0,POLSKA,1998,37879105,20005136,1479007,,4736406,,,3295480,4544384
2,0,POLSKA,1998,37879105,20005136,213579,,1211672,,,1501570,2565210
3,0,POLSKA,1998,37879105,20005136,145774,,1031600,,,1868793,3851906
4,0,POLSKA,1998,37879105,20005136,1645486,,5587173,,,4077363,5690422


## Employment

In [506]:
employ = pd.read_csv(r'data/inputs/1. matching_vars/employment/employ_2002_gmina.csv', delimiter=';')

In [507]:
employ.rename(columns={
    'Kod' : 'gmina_code',
    'Nazwa' : 'gmina_name',
    'ogółem;ogółem;ogółem;2002;[osoba]' : 'tot_pop_2002',
    'ogółem;ogółem;aktywni zawodowo ogółem;2002;[osoba]' : 'employed_2002',
    'ogółem;ogółem;aktywni zawodowo;2002;[osoba]' : 'active_2002',
    'ogółem;ogółem;aktywni zawodowo pracujący;2002;[osoba]' : 'active_employed_2002',
    'ogółem;ogółem;aktywni zawodowo bezrobotni;2002;[osoba]' : 'active_unemployed_2002',
    'ogółem;ogółem;bierni zawodowo;2002;[osoba]' : 'inactive_2002',
    'ogółem;w wieku produkcyjnym;ogółem;2002;[osoba]' : 'working_age_2002',
    'ogółem;w wieku produkcyjnym;aktywni zawodowo ogółem;2002;[osoba]' : 'working_age_active_2002',
    'ogółem;w wieku produkcyjnym;aktywni zawodowo pracujący;2002;[osoba]' : 'working_age_active_employed_2002',
    'ogółem;w wieku produkcyjnym;aktywni zawodowo bezrobotni;2002;[osoba]' : 'working_age_active_unemployed_2002',
    'ogółem;w wieku produkcyjnym;bierni zawodowo;2002;[osoba]' : 'working_age_inactive_2002'
}, inplace=True
)

In [508]:
employ.drop(columns=['Unnamed: 12'], inplace=True)

In [509]:
employ.head(3)

Unnamed: 0,gmina_code,gmina_name,tot_pop_2002,employed_2002,active_employed_2002,active_unemployed_2002,inactive_2002,working_age_2002,working_age_active_2002,working_age_active_employed_2002,working_age_active_unemployed_2002,working_age_inactive_2002
0,0,POLSKA,31288428,16776498,13218344,3558154,13456155,23625719,16247608,12718397,3529211,6426567
1,0,POLSKA - gminy miejskie,16133311,8476397,6659888,1816509,6968542,12352874,8303275,6505052,1798223,3426664
2,0,POLSKA - gminy miejsko-wiejskie,6730780,3649951,2794235,855716,2910531,5097198,3541481,2690751,850730,1402673


In [510]:
# counts = employ['gmina_code'].value_counts()
# unique_emp = employ[employ['gmina_code'].map(counts) == 1]
# aggs_pop = employ[employ['gmina_code'].map(counts) > 1]
# employ = unique_emp

In [511]:
df_harmonized.gmina_code.nunique()

4022

In [512]:
df_harmonized

Unnamed: 0,gmina_code,gmina_name,year,tot_pop,working_pop,higher,post_secondary,secondary,secondary_general,secondary_vocational,vocational,primary
0,0,POLSKA,1998,37879105,20005136,1838360,,6979678,,,6665843,10961500
1,0,POLSKA,1998,37879105,20005136,1479007,,4736406,,,3295480,4544384
2,0,POLSKA,1998,37879105,20005136,213579,,1211672,,,1501570,2565210
3,0,POLSKA,1998,37879105,20005136,145774,,1031600,,,1868793,3851906
4,0,POLSKA,1998,37879105,20005136,1645486,,5587173,,,4077363,5690422
...,...,...,...,...,...,...,...,...,...,...,...,...
4984,3217000,Powiat wałecki,2002,55254,34304,3330,1612,13063,3884,9179,10272,14865
4985,3218000,Powiat łobeski,2002,38583,23121,1488,1014,6559,2852,3707,8139,12770
4986,3261000,Powiat m. Koszalin,2002,108709,71738,15989,2729,35019,13311,21708,16026,20087
4987,3262000,Powiat m. Szczecin,2002,415399,272325,59654,12656,128047,51439,76608,62629,78387


In [513]:
df_harmonized.shape

(4989, 12)

In [517]:
df_harmonized.head(10)

Unnamed: 0,gmina_code,gmina_name,year,tot_pop,working_pop,higher,post_secondary,secondary,secondary_general,secondary_vocational,vocational,primary
0,0,POLSKA,1998,37879105,20005136,1838360,,6979678,,,6665843,10961500
1,0,POLSKA,1998,37879105,20005136,1479007,,4736406,,,3295480,4544384
2,0,POLSKA,1998,37879105,20005136,213579,,1211672,,,1501570,2565210
3,0,POLSKA,1998,37879105,20005136,145774,,1031600,,,1868793,3851906
4,0,POLSKA,1998,37879105,20005136,1645486,,5587173,,,4077363,5690422
5,0,POLSKA,1998,37879105,20005136,192874,,1392505,,,2588480,5271078
6,0,POLSKA - gminy miejskie,1998,19050104,10561917,1838360,,6979678,,,6665843,10961500
7,0,POLSKA - gminy miejskie,1998,19050104,10561917,1479007,,4736406,,,3295480,4544384
8,0,POLSKA - gminy miejskie,1998,19050104,10561917,213579,,1211672,,,1501570,2565210
9,0,POLSKA - gminy miejskie,1998,19050104,10561917,145774,,1031600,,,1868793,3851906


In [None]:
df_harmonized.merge(employ.drop(columns=['gmina_name']), on='gmina_code', copy=False)

## Compute changes and export data

In [None]:

# ID cols and value cols (adjust if you dropped/renamed others)
id_cols = ['gmina_code', 'gmina_name']
value_cols = [c for c in df_harmonized.columns if c not in id_cols + ['year']]

# Split years
df98 = df_harmonized[df_harmonized['year'] == 1998].set_index(id_cols)
df02 = df_harmonized[df_harmonized['year'] == 2002].set_index(id_cols)

# Join side-by-side
df_join = df98[value_cols].rename(columns=lambda c: f"{c}_1998").join(
          df02[value_cols].rename(columns=lambda c: f"{c}_2002"),
          how='outer'
).reset_index()

# Compute absolute and percent changes for each value column
for col in value_cols:
    c98 = f"{col}_1998"
    c02 = f"{col}_2002"
    abscol = f"{col}_abs_change"
    pctcol = f"{col}_pct_change"
    df_join[abscol] = df_join[c02] - df_join[c98]
    # percent change relative to 1998 (handle zero/NaN)
    df_join[pctcol] = np.where(
        df_join[c98].replace(0, np.nan).notna(),
        df_join[abscol] / df_join[c98],
        np.nan
    )

# Optional: keep only gminas observed in both years
both_years_mask = df_join[[f"{value_cols[0]}_1998", f"{value_cols[0]}_2002"]].notna().all(axis=1)
df_changes = df_join.loc[both_years_mask].copy()

# Inspect and save
display(df_changes.head())
df_changes.to_csv(r"data/clean/matching/gmina_changes_1998_2002.csv", index=False)