In [6]:
import geopandas as gpd
import pandas as pd
from eurocalliopelib.utils import eu_country_code_to_iso3

idx = pd.IndexSlice

FREIGHT_SECTORS = {
    'GT03': 'Mining and quarrying',
    'GT04': 'Food, beverages and tobacco',
    'GT05': 'Textiles and leather',
    'GT06': 'Wood and wood products',
    'GT11': 'Machinery Equipment',
    'GT12': 'Transport Equipment',
    'GT13': 'Other Industrial Sectors'
}

In [7]:
def to_numeric(series):
    series = series.astype(str).str.extract(r'(\-*\d+\.*\d*)')[0]
    return pd.to_numeric(series, errors='coerce')

def read_eurostat_tsv(path_to_tsv, index_names, slice_idx=None, slice_lvl=None):
    df = pd.read_csv(path_to_tsv, delimiter='\t', index_col=0)
    df.index = df.index.str.split(',', expand=True).rename(index_names)
    if slice_idx is not None:
        df = df.xs(slice_idx, level=slice_lvl)
    df.columns = df.columns.astype(int).rename("year")
    return df.apply(to_numeric)

In [8]:
nuts_2006 = gpd.read_file("modules/industry/resources/eurostat/nuts_2006.geojson")
nuts_2006["nuts_id_eu"] = nuts_2006.NUTS_ID.str.replace("GR", "EL")


In [9]:
freight_df = read_eurostat_tsv("modules/industry/resources/eurostat/freight.tsv.gz", ["subsector", "unit", "region"])
subregions = set(freight_df.index.get_level_values("region").unique()) & set(nuts_2006.NUTS_ID.unique())
freight_eu = (
        freight_df
        .unstack()
        .groupby(FREIGHT_SECTORS, level=0).sum()
        .where(lambda x: x > 0)
        .loc[:, idx[:, list(subregions)]]
        .stack([0, 1])
    )
freight_eu = (
        freight_eu
        .to_frame('freight')
        .set_index(freight_eu.index.get_level_values('region').str[:-1], append=True)
        .rename_axis(index=['subsector', 'year', 'nuts3', 'nuts2'])
    )
freight_eu

  .stack([0, 1])


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,freight
subsector,year,nuts3,nuts2,Unnamed: 4_level_1
"Food, beverages and tobacco",2008,CH023,CH02,1246.0
"Food, beverages and tobacco",2008,DE80H,DE80,109.0
"Food, beverages and tobacco",2008,ITD55,ITD5,3184.0
"Food, beverages and tobacco",2008,DE229,DE22,75.0
"Food, beverages and tobacco",2008,AT314,AT31,171.0
...,...,...,...,...
Wood and wood products,2019,ITG26,ITG2,39.0
Wood and wood products,2019,DE275,DE27,338.0
Wood and wood products,2019,HU212,HU21,148.0
Wood and wood products,2019,ES423,ES42,76.0


In [10]:
industry_employees = read_eurostat_tsv(
    "modules/industry/resources/eurostat/employees.tsv.gz", ['cat_code', 'indicator', 'region'],
    slice_idx='V16110', slice_lvl='indicator'
)


activity_codes_df = pd.read_csv(
        "modules/industry/workflow/internal/industry_activity_codes.csv", skipfooter=7, index_col=0, header=0, engine='python'
    ).dropna(subset=['Eurostat sector'])

In [7]:
industry_employees

Unnamed: 0_level_0,year,2018,2017,2016,2015,2014,2013,2012,2011,2010,2009,2008
cat_code,region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
B,AT,7389.0,7344.0,7127.0,7089.0,7215.0,7114.0,7220.0,7309.0,7420.0,7583.0,7767.0
B,AT1,2163.0,2240.0,2285.0,2343.0,,,2255.0,2400.0,2357.0,2498.0,2542.0
B,AT11,,,,232.0,,,248.0,248.0,262.0,258.0,292.0
B,AT12,1810.0,1823.0,1865.0,1898.0,,,1801.0,1953.0,1910.0,2050.0,2086.0
B,AT13,,,,213.0,,,206.0,199.0,185.0,190.0,164.0
...,...,...,...,...,...,...,...,...,...,...,...,...
S95,UKM7,,1006.0,1770.0,,,,,,,,
S95,UKM8,,1442.0,2394.0,,,,,,,,
S95,UKM9,,304.0,306.0,,,,,,,,
S95,UKN,,883.0,1751.0,781.0,509.0,554.0,,545.0,553.0,704.0,386.0


In [11]:
industry_employees = industry_employees.reset_index()
industry_employees = industry_employees[industry_employees["cat_code"].isin(activity_codes_df['Eurostat sector'].dropna().index)]
industry_employees = industry_employees.set_index(["cat_code", "region"]).unstack().groupby(activity_codes_df['Eurostat sector'].to_dict()).sum(min_count=1).stack(["year", "region"]).rename_axis(index=['cat_name', 'year', 'region'])

  industry_employees = industry_employees.set_index(["cat_code", "region"]).unstack().groupby(activity_codes_df['Eurostat sector'].to_dict()).sum(min_count=1).stack(["year", "region"]).rename_axis(index=['cat_name', 'year', 'region'])
  industry_employees = industry_employees.set_index(["cat_code", "region"]).unstack().groupby(activity_codes_df['Eurostat sector'].to_dict()).sum(min_count=1).stack(["year", "region"]).rename_axis(index=['cat_name', 'year', 'region'])


In [12]:
industry_employees

cat_name                year  region
Chemicals Industry      2008  AT        27922.0
                              AT1       11597.0
                              AT11        379.0
                              AT12       5006.0
                              AT13       6212.0
                                         ...   
Wood and wood products  2018  SK0       25144.0
                              SK01       1409.0
                              SK02       6275.0
                              SK03      10566.0
                              SK04       6894.0
Length: 41430, dtype: float64

In [9]:
# Combine freight and employee data
freight_employees = pd.concat([
    freight_eu['freight'].reset_index('nuts3'),
    industry_employees.reindex(freight_eu.droplevel('nuts3').index).to_frame('employees_nuts2')
], axis=1).set_index('nuts3', append=True)

freight_employees = freight_employees.reset_index()
employees = industry_employees.reset_index()

In [16]:
freight_employees.sort_values(['subsector', 'year', 'nuts2', 'nuts3'])

Unnamed: 0,subsector,year,nuts2,nuts3,freight,employees_nuts2
611,"Food, beverages and tobacco",2008,AT11,AT111,186.0,2871.0
272,"Food, beverages and tobacco",2008,AT11,AT112,193.0,2871.0
106,"Food, beverages and tobacco",2008,AT11,AT113,184.0,2871.0
791,"Food, beverages and tobacco",2008,AT12,AT121,936.0,16692.0
1141,"Food, beverages and tobacco",2008,AT12,AT122,219.0,16692.0
...,...,...,...,...,...,...
90102,Wood and wood products,2019,UKL2,UKL24,142.0,
90512,Wood and wood products,2019,UKM5,UKM50,53.0,
89780,Wood and wood products,2019,UKM6,UKM61,85.0,
90097,Wood and wood products,2019,UKM6,UKM62,359.0,
