In [1]:
# %load_ext IPython.extensions.autoreload
%reload_ext IPython.extensions.autoreload

In [2]:
import numpy as np
import pandas as pd

import openpyxl

import sys
import os
import runpy

import requests
from io import StringIO
import json

import json
import glob
from pathlib import Path

In [3]:
os.chdir('../..')
os.getcwd()


'c:\\Users\\jarem\\OneDrive - London School of Economics\\YEAR 2\\PP4V8 - policy paper\\policy-paper-repo'

# Operationalisation of variables

## 1. Eligible regions (NUTS)

In [4]:
nuts_eligible_14_20 = pd.read_excel(r"data\inputs\cohesion_data_europa\Cohesion_policy_indicators_2024.xlsx", sheet_name="M3")
nuts_eligible_14_20 = nuts_eligible_14_20[["NUTS", "VALUE"]]

In [5]:
iso3_eligible_14_20 = pd.read_excel(r"data\inputs\cohesion_data_europa\Cohesion_policy_indicators_2024.xlsx", sheet_name="M4")
iso3_eligible_14_20 = iso3_eligible_14_20[["NUTS", "VALUE"]]

In [6]:
nuts_eligible_21_27 = pd.read_excel(r"data\inputs\cohesion_data_europa\Cohesion_policy_indicators_2024.xlsx", sheet_name="M1")
nuts_eligible_21_27 = nuts_eligible_21_27[["NUTS", "VALUE"]]

In [7]:
iso3_eligible_21_27 = pd.read_excel(r"data\inputs\cohesion_data_europa\Cohesion_policy_indicators_2024.xlsx", sheet_name="M2")
iso3_eligible_21_27 = iso3_eligible_21_27[["NUTS", "VALUE"]]

In [8]:
nuts_eligibility_dict = {
    1 : "Less developed regions" , 
    2 : "Transition regions", 
    3 : "More developed regions"   
}

iso3_eligibility_dict = {
    1 : "Suppoerted" , 
    2 : "Supported on a transitional and specific basis", 
    3 : "Not supported"   
}

In [9]:
# sum(nuts_eligible_21_27.VALUE == 2 )

## Dane GUS

### Programme period: 2014 - 2020

In [10]:
PL_projects_14_20= pd.read_excel(r"data\inputs\dane_gus\lista_projektow_UE_2014_20\Lista_projektow_FE_2014_2020_02112025.xlsx", header = 2)

#### Clean up data

In [45]:
from src.functions import *

In [None]:
PL_projects_14_20.drop(columns=[
    "Forma finansowania/ Form of finance",
    "Projekt realizowany w ramach terytorialnych mechanizmów wdrażania/ Project implemented under territorial delivery mechanisms",
    "Działanie/ Measure",
    "Poddziałanie/ Submeasure",
    "Dziedzina działalności gospodarczej, której dotyczy projekt/ Area of economic activity",
    "Obszar wsparcia projektu/ Area of project intervention",
    "Cel projektu/ Project thematic objective",
    "Cel uzupełniający dla projektów EFS/ ESF secondary theme"
], inplace=True)
PL_projects_14_20 = clean_english_colnames(PL_projects_14_20, normalize='keep')   
PL_projects_14_20 = PL_projects_14_20[PL_projects_14_20['Fund'] != 'BAR'] 


In [None]:
PL_projects_14_20['Amount of EU co-financing (EUR)'] = PL_projects_14_20['Amount of EU co-financing (PLN, for ETC projects EUR)']/4.4975 # this exchange rate was taken from the report


In [None]:
# PL_projects_14_20.columns

In [None]:
df_locations = unnest_locations(PL_projects_14_20, 'Project location', 'Amount of EU co-financing (PLN, for ETC projects EUR)','Amount of EU co-financing (EUR)')

print(f"Original rows: {len(PL_projects_14_20)}")
print(f"Rows after unnesting: {len(df_locations)}")


Original rows: 103189
Rows after unnesting: 143327


In [None]:
# add country marker
df_locations = add_iso3_column(df_locations)

In [None]:
# display(df_locations[['Contract number','Project location', 'woj', 'pow', 'funding_split']].head(10))
# display(df_locations.head(3))

In [None]:
# save output
df_locations.to_csv(r"data\outputs\eu_flows\pol_1420_projects_by_location.csv", index=False)

In [None]:
# df_locations["Project location"].unique().tolist()

#### Reshape df so that powiats are row indexes

panel structure with treatment timing

In [19]:
df_locations = df_locations[df_locations['iso3'] == 'POL']

In [20]:
# Convert dates to datetime
df_locations['Project start date'] = pd.to_datetime(df_locations['Project start date'], errors='coerce')
df_locations['Project end date'] = pd.to_datetime(df_locations['Project end date'], errors='coerce')

# Extract year of first project (treatment year)
df_locations['year_start'] = df_locations['Project start date'].dt.year

# Create treatment variable: year when powiat first received EU funding
df_treatment_timing = df_locations.groupby('pow').agg({
    'year_start': 'min',  # first year treated
    'funding_split': 'sum',
    'Contract number': 'nunique'
}).reset_index()
df_treatment_timing.columns = ['pow', 'treatment_year', 'total_funding', 'num_projects']

# Create binary treatment indicator for each year
# For staggered DiD: treated = 1 if year >= treatment_year for that powiat
years = range(df_locations['year_start'].min(), df_locations['year_start'].max() + 1)
treatment_data = []

for pow in df_treatment_timing['pow'].unique():
    treat_year = df_treatment_timing[df_treatment_timing['pow'] == pow]['treatment_year'].values[0]
    for year in years:
        treatment_data.append({
            'pow': pow,
            'year': year,
            'treated': 1 if year >= treat_year else 0,
            'time_to_treatment': year - treat_year if year >= treat_year else None
        })

df_staggered_did = pd.DataFrame(treatment_data)

# Add cumulative funding received by that year (for intensity)
df_funding_by_year = df_locations.groupby(['pow', 'year_start'])['funding_split'].sum().reset_index()
df_funding_by_year.columns = ['pow', 'year', 'funding_that_year']
df_funding_by_year['funding_cum'] = df_funding_by_year.groupby('pow')['funding_that_year'].cumsum()

df_staggered_did = df_staggered_did.merge(df_funding_by_year[['pow', 'year', 'funding_cum']], 
                                            on=['pow', 'year'], how='left')
df_staggered_did['funding_cum'] = df_staggered_did['funding_cum'].fillna(0)



In [21]:
print("Staggered DiD treatment structure:")
display(df_staggered_did.head(10))

# Summary: which powiats treated when?
display(df_treatment_timing.sort_values('treatment_year').head(10))



Staggered DiD treatment structure:


Unnamed: 0,pow,year,treated,time_to_treatment,funding_cum
0,Biała Podlaska,2014,1,0.0,26464630.0
1,Biała Podlaska,2015,1,1.0,34478040.0
2,Biała Podlaska,2016,1,2.0,109188600.0
3,Biała Podlaska,2017,1,3.0,146940200.0
4,Biała Podlaska,2018,1,4.0,211658100.0
5,Biała Podlaska,2019,1,5.0,233971800.0
6,Biała Podlaska,2020,1,6.0,298719500.0
7,Biała Podlaska,2021,1,7.0,316112800.0
8,Biała Podlaska,2022,1,8.0,319843900.0
9,Biała Podlaska,2023,1,9.0,321936600.0


Unnamed: 0,pow,treatment_year,total_funding,num_projects
394,żuromiński,2014,173045400.0,154
369,Łódź,2014,7612179000.0,2551
370,łaski,2014,190570200.0,202
371,łańcucki,2014,449491900.0,260
372,łobeski,2014,226623000.0,134
373,łomżyński,2014,453412100.0,330
27,Klaipedos rajono savivaldybe,2014,2267455.0,23
375,łowicki,2014,393427900.0,301
376,łukowski,2014,553387000.0,516
377,łódzki wschodni,2014,455230000.0,534


In [22]:
df_treatment_timing.treatment_year.value_counts().sort_index()

treatment_year
2014    360
2015     27
2016      8
2017      2
2022      1
Name: count, dtype: int64

In [23]:
# Save
df_staggered_did.to_parquet(r"data\outputs\eu_flows\pol_1420_treatment_staggered_did.parquet", index=False)
df_treatment_timing.to_parquet(r"data\outputs\eu_flows\pol_1420_treatment_timing.parquet", index=False)

### Programme period 2021-2027

In [25]:
PL_projects_21_27 = pd.read_excel(r"data\inputs\dane_gus\lista_projektow_UE_2021_27\Lista_projektow_FE_2021_2027_02112025.xlsx", header = 1)

#### Clean up 

In [26]:
from src.functions import *

In [27]:
PL_projects_21_27.drop(columns=[
    "Działanie/ Measure",
    # "Cel szczegółowy/ Specific objective",
    "Priorytet/ Priority", 
    "Działanie/ Measure",
    "Numer identyfikacyjny statku w rejestrze floty rybackiej UE/ The Union fishing fleet register identification number",
    "Nazwa wykonawcy kontraktu/ The contractor’s name",
    "Kategoria wsparcia/ Type of intervention"
], inplace=True)

PL_projects_21_27 = clean_english_colnames(PL_projects_21_27, normalize='keep')   
PL_projects_21_27 = PL_projects_21_27[PL_projects_21_27['Fund'] != 'BAR'] 

In [28]:
PL_projects_21_27["EU co-financing (EUR)"] = PL_projects_21_27["EU co-financing (PLN)"]/4.4975 # this exchange rate was taken from the report

In [29]:
# PL_projects_21_27.head()

In [30]:
PL_projects_21_27.columns

Index(['Project name', 'Project summary', 'Contract number',
       'Beneficiary name', 'Fund', 'Specific objective', 'Programme',
       'Total project value (PLN)', 'Union co-financing rate (%)',
       'EU co-financing (PLN)', 'EURO exchange rate', 'Project location',
       'Project start date', 'Project end date', 'EU co-financing (EUR)'],
      dtype='object')

In [31]:
df_locations = unnest_locations_with_gmina(PL_projects_21_27, 'Project location', 'EU co-financing (PLN)','EU co-financing (EUR)')

print(f"Original rows: {len(PL_projects_21_27)}")
print(f"Rows after unnesting: {len(df_locations)}")

Original rows: 23826
Rows after unnesting: 50011


In [32]:
# save output
df_locations.to_csv(r"data\outputs\eu_flows\pol_21_27_projects_by_location.csv", index=False)

In [33]:
df_locations = add_iso3_column(df_locations)
# df_locations = df_locations[df_locations['iso3'] == 'POL']


In [35]:
# df_locations["pow"].unique()

In [38]:
df_locations.head(3)

Unnamed: 0,Project name,Project summary,Contract number,Beneficiary name,Fund,Specific objective,Programme,Total project value (PLN),Union co-financing rate (%),EU co-financing (PLN),EURO exchange rate,Project location,Project start date,Project end date,EU co-financing (EUR),woj,pow,gmina,funding_split,iso3
0,Poprawa warunków recepcyjnych dla osób ubiegaj...,Polski system recepcyjny musi być przygotowany...,FAMI.01.01-IZ.00-0001/24,Szef Urzędu do Spraw Cudzoziemców,FAMI,FAMI.1 Wzmacnianie i rozwijanie wszystkich asp...,"Fundusz Azylu, Migracji i Integracji",41273504.53,80.0,33018803.62,,"WOJ: LUBELSKIE, POW: Biała Podlaska, GM: Biała...",2024-10-01 23:59:59,2028-12-31 23:59:59,7341591.0,LUBELSKIE,Biała Podlaska,Biała Podlaska,16509401.81,POL
1,Poprawa warunków recepcyjnych dla osób ubiegaj...,Polski system recepcyjny musi być przygotowany...,FAMI.01.01-IZ.00-0001/24,Szef Urzędu do Spraw Cudzoziemców,FAMI,FAMI.1 Wzmacnianie i rozwijanie wszystkich asp...,"Fundusz Azylu, Migracji i Integracji",41273504.53,80.0,33018803.62,,"WOJ: PODLASKIE, POW: zambrowski, GM: Zambrów",2024-10-01 23:59:59,2028-12-31 23:59:59,7341591.0,PODLASKIE,zambrowski,Zambrów,16509401.81,POL
2,Bezpieczna Przystań - Centrum Pomocy Kobietom ...,Szef Urzędu do Spraw Cudzoziemców (UdSC) jest ...,FAMI.01.01-IZ.00-0001/25,Szef Urzędu do Spraw Cudzoziemców,FAMI,FAMI.1 Wzmacnianie i rozwijanie wszystkich asp...,"Fundusz Azylu, Migracji i Integracji",67327046.73,75.0,50495285.04,,"WOJ: MAZOWIECKIE, POW: legionowski, GM: Serock",2025-05-01 23:59:59,2029-12-31 23:59:59,11227410.0,MAZOWIECKIE,legionowski,Serock,50495285.04,POL


#### Reshape df so that powiats are row indexes

panel structure with treatment timing

In [39]:
# Convert dates to datetime
df_locations['Project start date'] = pd.to_datetime(df_locations['Project start date'], errors='coerce')
df_locations['Project end date'] = pd.to_datetime(df_locations['Project end date'], errors='coerce')

# Extract year of first project (treatment year)
df_locations['year_start'] = df_locations['Project start date'].dt.year

# --- Powiat-level treatment timing (kept for reference) ---
df_treatment_timing = df_locations.groupby('pow').agg({
    'year_start': 'min',    # first year treated
    'funding_split': 'sum',
    'Contract number': 'nunique'
}).reset_index()
df_treatment_timing.columns = ['pow', 'treatment_year', 'total_funding', 'num_projects']

# --- Gmina-level treatment timing ---
# Group by pow + gmina (gmina may be None for some rows)
df_treatment_timing_gmina = df_locations.groupby(['pow', 'gmina']).agg({
    'year_start': 'min',
    'funding_split': 'sum',
    'Contract number': 'nunique'
}).reset_index()
df_treatment_timing_gmina.columns = ['pow', 'gmina', 'treatment_year', 'total_funding', 'num_projects']

# Create binary treatment indicator for each year at gmina level
years = range(int(df_locations['year_start'].min()), int(df_locations['year_start'].max()) + 1)
treatment_data = []
for _, row in df_treatment_timing_gmina.iterrows():
    pow_ = row['pow']
    gmina_ = row['gmina']
    treat_year = int(row['treatment_year']) if pd.notna(row['treatment_year']) else None
    for year in years:
        treated = 1 if (treat_year is not None and year >= treat_year) else 0
        ttt = (year - treat_year) if (treat_year is not None and year >= treat_year) else None
        treatment_data.append({
            'pow': pow_,
            'gmina': gmina_,
            'year': year,
            'treated': treated,
            'time_to_treatment': ttt
        })

df_staggered_did_gmina = pd.DataFrame(treatment_data)

# Add cumulative funding received by that year at gmina level (intensity)
df_funding_by_year_gmina = df_locations.groupby(['pow', 'gmina', 'year_start'])['funding_split'].sum().reset_index()
df_funding_by_year_gmina.columns = ['pow', 'gmina', 'year', 'funding_that_year']
df_funding_by_year_gmina['funding_cum'] = df_funding_by_year_gmina.groupby(['pow', 'gmina'])['funding_that_year'].cumsum()

df_staggered_did_gmina = df_staggered_did_gmina.merge(
    df_funding_by_year_gmina[['pow', 'gmina', 'year', 'funding_cum']],
    on=['pow', 'gmina', 'year'],
    how='left'
)
df_staggered_did_gmina['funding_cum'] = df_staggered_did_gmina['funding_cum'].fillna(0)

# Quick checks / outputs
print("Gmina-level staggered DiD (first rows):")
display(df_staggered_did_gmina.head(10))

print("Powiat-level treatment timing (sample):")
display(df_treatment_timing.sort_values('treatment_year').head(10))

Gmina-level staggered DiD (first rows):


Unnamed: 0,pow,gmina,year,treated,time_to_treatment,funding_cum
0,Biała Podlaska,Biała Podlaska,2014,0,,0.0
1,Biała Podlaska,Biała Podlaska,2015,0,,0.0
2,Biała Podlaska,Biała Podlaska,2016,0,,0.0
3,Biała Podlaska,Biała Podlaska,2017,0,,0.0
4,Biała Podlaska,Biała Podlaska,2018,0,,0.0
5,Biała Podlaska,Biała Podlaska,2019,0,,0.0
6,Biała Podlaska,Biała Podlaska,2020,0,,0.0
7,Biała Podlaska,Biała Podlaska,2021,1,0.0,142284300.0
8,Biała Podlaska,Biała Podlaska,2022,1,1.0,184032500.0
9,Biała Podlaska,Biała Podlaska,2023,1,2.0,216906500.0


Powiat-level treatment timing (sample):


Unnamed: 0,pow,treatment_year,total_funding,num_projects
395,łomżyński,2014,348094300.0,44
41,Kraków,2014,3375614000.0,370
283,poznański,2014,816589000.0,163
374,węgrowski,2014,105750400.0,72
270,oświęcimski,2014,610266000.0,152
380,zawierciański,2014,292315000.0,155
235,miński,2014,144720900.0,100
197,krakowski,2014,812903800.0,198
313,siedlecki,2014,966368500.0,102
366,wrocławski,2014,419898500.0,93


In [40]:
# Save outputs
df_staggered_did_gmina.to_parquet(r"data\outputs\eu_flows\pol_2127_treatment_staggered_did_gmina.parquet", index=False)
df_treatment_timing_gmina.to_parquet(r"data\outputs\eu_flows\pol_2127_treatment_timing_gmina.parquet", index=False)
df_treatment_timing.to_parquet(r"data\outputs\eu_flows\pol_2127_treatment_timing_pow.parquet", index=False)

In [15]:
display(df_locations[['Contract number','Project location', 'woj', 'pow', 'funding_split']].head(10))
# display(df_locations.head(3))

Unnamed: 0,Contract number,Project location,woj,pow,funding_split
0,FAMI.01.01-IZ.00-0001/24,"WOJ: LUBELSKIE, POW: Biała Podlaska, GM: Biała...",LUBELSKIE,Biała Podlaska,16509400.0
1,FAMI.01.01-IZ.00-0001/24,"WOJ: PODLASKIE, POW: zambrowski, GM: Zambrów",PODLASKIE,zambrowski,16509400.0
2,FAMI.01.01-IZ.00-0001/25,"WOJ: MAZOWIECKIE, POW: legionowski, GM: Serock",MAZOWIECKIE,legionowski,50495290.0
6,FAMI.02.01-IZ.00-0001/25,"WOJ: MAŁOPOLSKIE, POW: nowosądecki",MAŁOPOLSKIE,nowosądecki,900000.0
12,FAMI.02.01-IZ.00-0007/24,"WOJ: PODLASKIE, POW: Białystok, GM: Białystok",PODLASKIE,Białystok,3388984.0
18,FAMI.02.01-IZ.00-0013/24,"WOJ: MAZOWIECKIE, POW: Warszawa, GM: Warszawa",MAZOWIECKIE,Warszawa,27089000.0
29,FAMI.02.01-IZ.00-0025/24,"WOJ: DOLNOŚLĄSKIE, POW: Jelenia Góra, GM: Jele...",DOLNOŚLĄSKIE,Jelenia Góra,9755579.0
30,FAMI.02.01-IZ.00-0025/24,"WOJ: DOLNOŚLĄSKIE, POW: Legnica, GM: Legnica",DOLNOŚLĄSKIE,Legnica,9755579.0
31,FAMI.02.01-IZ.00-0025/24,"WOJ: DOLNOŚLĄSKIE, POW: Wałbrzych, GM: Wałbrzych",DOLNOŚLĄSKIE,Wałbrzych,9755579.0
32,FAMI.02.01-IZ.00-0025/24,"WOJ: DOLNOŚLĄSKIE, POW: Wrocław, GM: Wrocław",DOLNOŚLĄSKIE,Wrocław,9755579.0


In [19]:
df_staggered_did_gmina

Unnamed: 0,pow,gmina,year,treated,time_to_treatment,funding_cum
0,Bautzen,Przewóz,2014,0,,0.00
1,Bautzen,Przewóz,2015,0,,0.00
2,Bautzen,Przewóz,2016,0,,0.00
3,Bautzen,Przewóz,2017,0,,0.00
4,Bautzen,Przewóz,2018,0,,0.00
...,...,...,...,...,...,...
36171,Žilinský kraj,Zebrzydowice,2023,1,0.0,1802074.13
36172,Žilinský kraj,Zebrzydowice,2024,1,1.0,0.00
36173,Žilinský kraj,Zebrzydowice,2025,1,2.0,0.00
36174,Žilinský kraj,Zebrzydowice,2026,1,3.0,0.00


#### Comine into 1 dataset

In [48]:
df_1420 = pd.read_parquet(r"data\outputs\eu_flows\pol_1420_treatment_staggered_did.parquet")
df_2127 = pd.read_parquet(r"data\outputs\eu_flows\pol_2127_treatment_staggered_did_gmina.parquet")

# 1420 doesn't have gmina, add it as None
if 'gmina' not in df_1420.columns:
    df_1420['gmina'] = None

# Stack the datasets
df_combined = pd.concat([df_1420, df_2127], axis=0, ignore_index=True)

# Sort and ensure panel structure
df_combined = df_combined.sort_values(['pow', 'gmina', 'year']).reset_index(drop=True)

# Create period identifier
df_combined['programme'] = df_combined['year'].apply(lambda x: '2014-2020' if x <= 2020 else '2021-2027')

# Handle treatment timing across periods
df_combined['ever_treated'] = df_combined.groupby(['pow', 'gmina'])['treated'].transform('max')

# Create lead/lag variables using transform (returns aligned Series)
def years_since_treat(x):
    if (x['treated'] == 1).any():
        first_treat_year = x.loc[x['treated'] == 1, 'year'].min()
        return x['year'] - first_treat_year
    else:
        return np.nan

df_combined['years_since_treatment'] = df_combined.groupby(['pow', 'gmina'], group_keys=False).apply(years_since_treat, include_groups=False)

print("Combined dataset shape:", df_combined.shape)
print("Year range:", df_combined['year'].min(), "to", df_combined['year'].max())
print("Treated units:", df_combined['ever_treated'].sum())
display(df_combined.head(15))

Combined dataset shape: (36614, 9)
Year range: 2014 to 2027
Treated units: 32634.0


Unnamed: 0,pow,year,treated,time_to_treatment,funding_cum,gmina,programme,ever_treated,years_since_treatment
0,Biała Podlaska,2014,0,,0.0,Biała Podlaska,2014-2020,1.0,-7.0
1,Biała Podlaska,2015,0,,0.0,Biała Podlaska,2014-2020,1.0,-6.0
2,Biała Podlaska,2016,0,,0.0,Biała Podlaska,2014-2020,1.0,-5.0
3,Biała Podlaska,2017,0,,0.0,Biała Podlaska,2014-2020,1.0,-4.0
4,Biała Podlaska,2018,0,,0.0,Biała Podlaska,2014-2020,1.0,-3.0
5,Biała Podlaska,2019,0,,0.0,Biała Podlaska,2014-2020,1.0,-2.0
6,Biała Podlaska,2020,0,,0.0,Biała Podlaska,2014-2020,1.0,-1.0
7,Biała Podlaska,2021,1,0.0,142284300.0,Biała Podlaska,2021-2027,1.0,0.0
8,Biała Podlaska,2022,1,1.0,184032500.0,Biała Podlaska,2021-2027,1.0,1.0
9,Biała Podlaska,2023,1,2.0,216906500.0,Biała Podlaska,2021-2027,1.0,2.0


In [47]:
df_combined.ever_treated.unique()

array([ 1., nan])

In [None]:
# Save combined dataset
df_combined.to_parquet(r"data\clean\eu_flows\pol_combined_1420_2127_panel.parquet", index=False)

## EU Cohesion funding

In [4]:
keys_path = os.path.join(os.getcwd(), "keys", "api_keys.ipy")
keys = runpy.run_path(keys_path)   
kohesio_key = keys.get("kohesio_key")

In [12]:
url = "https://keep.eu/api/open-data"
params = {"key": kohesio_key}

In [16]:
# Load the JSON file
with open(r"data\inputs\cohesion_data_europa\keep.eu_opendata_202511291224.json", "r", encoding='utf-8') as f:
    data = json.load(f)

programmes = data.get("programmes", [])
df_programmes = pd.json_normalize(programmes)

In [19]:
# print(df_programmes)

In [None]:
projects_list = []
for idx, row in df_programmes.iterrows():
    if isinstance(row['projects'], list):
        for project in row['projects']:
            project_dict = project.copy() if isinstance(project, dict) else {}
            project_dict['programme_id'] = row['id']
            project_dict['programme_title'] = row['title']
            project_dict['programme_period'] = row['period']
            project_dict['eu_funding'] = row['eu_funding']
            projects_list.append(project_dict)

df_projects = pd.DataFrame(projects_list)
print(f"Total projects extracted: {len(df_projects)}")
print(df_projects.head())

In [24]:
# After creating df_projects, flatten nested columns
df_projects_flat = df_projects.copy()

# Convert dict/list columns to JSON strings for CSV storage
for col in df_projects_flat.columns:
    if df_projects_flat[col].dtype == 'object':
        # Check if column contains dicts or lists
        if df_projects_flat[col].apply(lambda x: isinstance(x, (dict, list))).any():
            df_projects_flat[col] = df_projects_flat[col].apply(
                lambda x: json.dumps(x) if isinstance(x, (dict, list)) else x
            )

In [8]:
kohesio = pd.read_csv(r"data\outputs\cohesion_projects_all.csv")

  kohesio = pd.read_csv(r"data\outputs\cohesion_projects_all.csv")


In [15]:
kohesio.tail()

Unnamed: 0,acronym,project_id,translations,themes,call,documents,status,relevant_mentions_and_prizes,relevant_precedent_projects,relevant_subsequent_projects,...,eusalp,eusalp_p_area_horizontal_action,eusalp_degree_of_compliance,hierarchy,number_of_partnerships,partnerships,programme_id,programme_title,programme_period,programme_type
30998,UrbanCOOP,,{'en': {'name': 'Better cooperation governance...,"['Urban development', 'Governance, partnership']",,[],ongoing,,,,...,False,,,,1,"[{'partner_name': '', 'partner_translated_name...",394,2021 - 2027 Interreg VI-C Interreg Europe,2021-2027,Interregional
30999,VIADUCT,,{'en': {'name': 'Valorising public research to...,"['Clustering and economic cooperation', 'Knowl...",,[],ongoing,,,,...,False,,,,1,"[{'partner_name': '', 'partner_translated_name...",394,2021 - 2027 Interreg VI-C Interreg Europe,2021-2027,Interregional
31000,WEEEWaste,,{'en': {'name': 'Improving policies for waste ...,"['Regional planning and development', 'Waste a...",,[],ongoing,,,,...,False,,,,1,[{'partner_name': 'České vysoké učení technick...,394,2021 - 2027 Interreg VI-C Interreg Europe,2021-2027,Interregional
31001,WeSTEMEU,,"{'en': {'name': 'Women for Science, Technology...","['Regional planning and development', 'Social ...",,[],ongoing,,,,...,False,,,,1,"[{'partner_name': '', 'partner_translated_name...",394,2021 - 2027 Interreg VI-C Interreg Europe,2021-2027,Interregional
31002,ZCI,,"{'en': {'name': 'Zero Carbon Infrastructure', ...","['Regional planning and development', 'Climate...",,[],ongoing,,,,...,False,,,,1,"[{'partner_name': '', 'partner_translated_name...",394,2021 - 2027 Interreg VI-C Interreg Europe,2021-2027,Interregional


In [6]:
NUTS2_funding = pd.read_csv(r"data\inputs\cohesion_data_europa\Historic_EU_payments_annual_timeseries_-_regionalised_and_modelled_20251126.csv")

In [7]:
NUTS2_funding

Unnamed: 0,Country,NUTS1_ID,NUTS2 code+name,NUTS2_ID,NUTS2_name,Fund,Year,Programming_Period,EU_Payment_annual,Modelled_annual_expenditure,Standard_Deviation_of_annual_expenditure,Standard_Error_of_modelled_annual_expenditure
0,BE,BE1,BE10 Région de Bruxelles-Capitale / Brussels H...,BE10,Région de Bruxelles-Capitale / Brussels Hoofds...,EAFRD,1993,1989-1993,0,211,24,0
1,BE,BE1,BE10 Région de Bruxelles-Capitale / Brussels H...,BE10,Région de Bruxelles-Capitale / Brussels Hoofds...,EAFRD,1994,1989-1993,589,508,15,0
2,BE,BE1,BE10 Région de Bruxelles-Capitale / Brussels H...,BE10,Région de Bruxelles-Capitale / Brussels Hoofds...,EAFRD,1995,1989-1993,339,256,29,0
3,BE,BE1,BE10 Région de Bruxelles-Capitale / Brussels H...,BE10,Région de Bruxelles-Capitale / Brussels Hoofds...,EAFRD,1996,1989-1993,0,139,38,1
4,BE,BE1,BE10 Région de Bruxelles-Capitale / Brussels H...,BE10,Région de Bruxelles-Capitale / Brussels Hoofds...,EAFRD,1997,1989-1993,521,334,21,0
...,...,...,...,...,...,...,...,...,...,...,...,...
41335,UK,UKZ,UKZZ Extra-Regio NUTS 2,UKZZ,Extra-Regio NUTS 2,ESF,2018,2014-2020,281494,242524,6724,212
41336,UK,UKZ,UKZZ Extra-Regio NUTS 2,UKZZ,Extra-Regio NUTS 2,ESF,2019,2014-2020,131913,924167,136700,4322
41337,UK,UKZ,UKZZ Extra-Regio NUTS 2,UKZZ,Extra-Regio NUTS 2,ESF,2020,2014-2020,3172883,2374081,138053,4365
41338,UK,UKZ,UKZZ Extra-Regio NUTS 2,UKZZ,Extra-Regio NUTS 2,ESF,2021,2014-2020,74662,96915,9286,293


## Elections data 

In [6]:
df = pd.read_csv(r"data\inputs\election_data\Populists_dataset\The PopuList 3.0.csv", delimiter=';')

In [7]:
df

Unnamed: 0,party_name,country_name,party_name_english,party_name_short,populist,populist_start,populist_end,populist_startnobl,populist_endnobl,populist_bl,...,farleft_bl,eurosceptic,eurosceptic_start,eurosceptic_end,eurosceptic_startnobl,eurosceptic_endnobl,eurosceptic_bl,in_parliament,partyfacts_id,parlgov_id
0,Bündnis Zukunft Österreich,Austria,Alliance for the Future of Austria,BZÖ,1,1900,2100,1900,2100,0,...,0,1,1900,2100,1900,2100,0,0.0,599.0,1536.0
1,Freiheitliche Partei Österreichs,Austria,Freedom Party of Austria,FPÖ,1,1900,2100,1900,2100,0,...,0,1,1900,2100,1900,2100,0,1.0,463.0,50.0
2,Liste Hans-Peter Martin,Austria,Hans-Peter Martin's List,Martin,1,1900,2100,1900,2100,0,...,0,1,1900,2100,1900,2100,0,0.0,1708.0,669.0
3,Team Stronach,Austria,Team Stronach,TS,1,1900,2100,1900,2100,0,...,0,1,1900,2100,1900,2100,0,0.0,1971.0,2150.0
4,JETZT - Liste Pilz,Austria,NOW – Pilz List,Pilz,1,1900,2100,2100,2100,1,...,0,0,2100,2100,2100,2100,0,0.0,6137.0,2651.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,solidaritéS,Switzerland,Solidarity,SOL,0,2100,2100,2100,2100,0,...,0,1,1900,2100,1900,2100,0,1.0,2939.0,1226.0
230,Democratic Unionist Party,United Kingdom,Democratic Unionist Party,DUP,1,1900,2100,2100,2100,1,...,0,1,1900,2100,1900,2100,0,1.0,335.0,319.0
231,Respect – The Unity Coalition,United Kingdom,Respect -- The Unity Coalition,R,1,1900,2100,1900,2100,0,...,0,1,1900,2100,1900,2100,0,0.0,1082.0,1057.0
232,United Kingdom Independence Party,United Kingdom,United Kingdom Independence Party,UKIP,1,1900,2100,1900,2100,0,...,0,1,1900,2100,1900,2100,0,0.0,601.0,1272.0


In [14]:
df[df["country_name"] == "Poland"].party_name.unique()

array(['Konfederacja Odnowy Rzeczypospolitej Wolnosc i Nadzieja',
       "Kukiz'15", 'Lewica Razem', 'Liga Polskich Rodzin', 'Partia X',
       'Polski Zwiazek Zachodni', 'Prawo i Sprawiedliwosc',
       'Ruch Narodowy', 'Ruch Odbudowy Polski',
       'Samoobrona Rzeczypospolitej Polskiej', 'Unia Polityki Realnej ',
       'Zjednoczenie Chrzescijansko-Narodowe',
       'Konfederacja Wolność i Niepodległość',
       'Konfederacja Korony Polskiej', 'Solidarna Polska',
       'Kongres Nowej Prawicy'], dtype=object)

In [8]:
elections_df = pd.read_csv(r"data\inputs\election_data\Poland\gminy\wyniki_gl_na_listy_po_gminach_proc_sejm_csv\wyniki_gl_na_listy_po_gminach_proc_sejm_utf8.csv", delimiter=';')

In [None]:
elections_df = elections

Unnamed: 0,TERYT Gminy,Gmina,Powiat,Województwo,Nr okręgu,Liczba komisji,Liczba uwzględnionych komisji,Frekwencja,Procent głosów nieważnych,W tym z powodu postawienia znaku „X” obok nazwiska dwóch lub większej liczby kandydatów z różnych list,...,KOMITET WYBORCZY NOWA LEWICA,KOMITET WYBORCZY PRAWO I SPRAWIEDLIWOŚĆ,KOMITET WYBORCZY KONFEDERACJA WOLNOŚĆ I NIEPODLEGŁOŚĆ,KOALICYJNY KOMITET WYBORCZY KOALICJA OBYWATELSKA PO .N IPL ZIELONI,KOMITET WYBORCZY POLSKA JEST JEDNA,KOMITET WYBORCZY WYBORCÓW RUCHU DOBROBYTU I POKOJU,KOMITET WYBORCZY NORMALNY KRAJ,KOMITET WYBORCZY ANTYPARTIA,KOMITET WYBORCZY RUCH NAPRAWY POLSKI,KOMITET WYBORCZY WYBORCÓW MNIEJSZOŚĆ NIEMIECKA
0,,Albania,zagranica,,19,1,1,9532,049,2500,...,1763,851,752,4661,123,,,,,
1,,Algieria,zagranica,,19,1,1,9091,500,3333,...,1053,1579,877,4737,000,,,,,
2,,Angola,zagranica,,19,1,1,8889,000,,...,833,1667,1667,3750,833,,,,,
3,,Azerbejdżan,zagranica,,19,1,1,10000,230,5000,...,1294,353,588,6000,235,,,,,
4,,Argentyna,zagranica,,19,1,1,8884,1055,9048,...,2584,2022,674,3258,000,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2580,321804.0,gm. Resko,łobeski,zachodniopomorskie,41,8,8,6072,364,6504,...,750,3611,495,3420,089,068,,,,
2581,321805.0,gm. Węgorzyno,łobeski,zachodniopomorskie,41,6,6,6149,445,6884,...,740,4010,639,2764,051,084,,,,
2582,326101.0,Koszalin,Koszalin,zachodniopomorskie,40,55,55,7599,135,6143,...,879,2540,571,4576,098,,,,,
2583,326201.0,Szczecin,Szczecin,zachodniopomorskie,41,208,208,7788,105,4998,...,978,2525,589,4423,112,024,,,,
