# 💼 Title: Merging DARES tension indicators with Departements Geospatial Data


📌 Overview
This notebook merges DARES labor market tension indicators for professions (FAP 87 and FAP 225) at the department level with geospatial department data.

In [13]:
# IMPORT LIBRARIES
import matplotlib.pyplot as plt
import pandas as pd
import re # For regular expression
import geopandas as gpd # To read geospatial data
from pathlib import Path # To set relative paths
import unidecode # To standardize strings
import py7zr # To unzip files

In [14]:
# GETTING PROJECT'S ROOT DIRECTORY
# The kernel's working directory is resolved to an absolute path first;
# the project root is then taken to be its parent folder.
# NOTE(review): assumes the notebook is launched from a direct sub-folder of the project root — confirm.
base_folder = Path(".").resolve()
main_folder = base_folder.parent

In [None]:
# EXTRACTING DEPARTEMENTS ZIPPED SHAPEFILE
# The archive is unpacked in place, i.e. into the folder that contains it.
extract_dir = main_folder / "data" / "shapefiles" / "Departements"

# NOTE(review): this archive path ends in ".shp" and is identical to the shapefile path
# that is read later in the notebook — confirm the real name of the 7z archive.
seven_zip_path = extract_dir / "departements-20180101.shp"

with py7zr.SevenZipFile(seven_zip_path, mode='r') as seven_zip_file:
    seven_zip_file.extractall(path=extract_dir)

In [None]:
# SETTING ALL NECESSARY DIRECTORIES
# Every input and output of this notebook lives under <project root>/data.
data_folder = main_folder / "data"

# Departements shapefile (geospatial side of the merge)
shapefile_path = data_folder / "shapefiles" / "Departements" / "departements-20180101.shp"

# TODO: update this path once the Excel workbook has been uploaded to GitHub
indicators_path = data_folder / "Dares_données_tensions_2022_.xlsx"

# Destination of the matched table
output_path = data_folder / "2- Formatted Data" / "matched_dares_indicators_departements.csv"

In [None]:
# IMPORTING FILES
# Geospatial layer: one row per departement
departements_shp = gpd.read_file(shapefile_path)

# Both indicator tables are sheets of the same workbook, so they are read in a single call;
# passing a list of sheet names returns a dict of DataFrames keyed by sheet name.
indicator_sheets = pd.read_excel(indicators_path, sheet_name=["FAP 87 x DEP", "FAP 225 x DEP"])
indicators_fap87_df = indicator_sheets["FAP 87 x DEP"]
indicators_fap225_df = indicator_sheets["FAP 225 x DEP"]

In [None]:
# EXPLORING GEO FILE AND SMALL ADJUSTMENTS
# Rename the shapefile's "nom" column to "departement" so the join key has an explicit name.
# The frame is rebound instead of mutated with inplace=True, which keeps the step chainable.
departements_shp = departements_shp.rename(columns={"nom": "departement"})
departements_shp.head()

Unnamed: 0,Année,Code FAP 225,Libellé FAP 225,Code département,Libellé département,Emploi moyen (2015-2019),Tension,Tension (Valeurs imputées: volumétrie insuffisantes),Intensité d'embauches,Lien formation-emploi,...,Conditions de travail contraignantes,Inadéquation géographique,Tension - discret,Intensité d'embauches - discret,Lien formation-emploi - discret,Manque de main d'oeuvre disponible - discret,Non-durabilité de l'emploi - discret,Conditions de travail contraignantes - discret,Inadéquation géographique - discret,Croisement où volumétrie suffisante
0,2011,A0Z40,Agriculteurs salariés,1,AIN,458.31432,n.d.,0.479026,n.d.,n.d.,...,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,FAP87 - DEP
1,2011,A0Z40,Agriculteurs salariés,2,AISNE,1263.645583,-0.578154,-0.578154,0.707496,-0.199432,...,0.436598,1.098667,2,5,3,4,5,4,5,FAP225 - DEP
2,2011,A0Z40,Agriculteurs salariés,3,ALLIER,459.747956,-0.89294,-0.89294,0.721193,-0.199432,...,0.436598,1.098667,1,5,3,3,5,4,5,FAP225 - DEP
3,2011,A0Z40,Agriculteurs salariés,4,ALPES-DE-HAUTE-PROVENCE,330.000757,0.168244,0.168244,7.86582,-0.199432,...,0.436598,1.098667,4,5,3,3,5,4,5,FAP225 - DEP
4,2011,A0Z40,Agriculteurs salariés,5,HAUTES-ALPES,238.75679,n.d.,-0.237149,n.d.,n.d.,...,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,FAP87 - DEP


In [36]:
# EXPLORING INDICATORS FILES AND SMALL ADJUSTMENTS
# Give the department label column the same name in both indicator tables.
# The frames are rebound rather than mutated with inplace=True.
indicators_fap87_df = indicators_fap87_df.rename(columns={"Libellé département": "departement_nom"})
indicators_fap225_df = indicators_fap225_df.rename(columns={"Libellé département": "departement_nom"})

# Only the last expression of a cell is rendered automatically, so the FAP 87 preview
# has to go through display() (available by default in an IPython/Jupyter kernel).
display(indicators_fap87_df.head())
indicators_fap225_df.head()

Unnamed: 0,Année,Code FAP 225,Libellé FAP 225,Code département,departement_nom,Emploi moyen (2015-2019),Tension,Tension (Valeurs imputées: volumétrie insuffisantes),Intensité d'embauches,Lien formation-emploi,...,Conditions de travail contraignantes,Inadéquation géographique,Tension - discret,Intensité d'embauches - discret,Lien formation-emploi - discret,Manque de main d'oeuvre disponible - discret,Non-durabilité de l'emploi - discret,Conditions de travail contraignantes - discret,Inadéquation géographique - discret,Croisement où volumétrie suffisante
0,2011,A0Z40,Agriculteurs salariés,1,AIN,458.31432,n.d.,0.479026,n.d.,n.d.,...,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,FAP87 - DEP
1,2011,A0Z40,Agriculteurs salariés,2,AISNE,1263.645583,-0.578154,-0.578154,0.707496,-0.199432,...,0.436598,1.098667,2,5,3,4,5,4,5,FAP225 - DEP
2,2011,A0Z40,Agriculteurs salariés,3,ALLIER,459.747956,-0.89294,-0.89294,0.721193,-0.199432,...,0.436598,1.098667,1,5,3,3,5,4,5,FAP225 - DEP
3,2011,A0Z40,Agriculteurs salariés,4,ALPES-DE-HAUTE-PROVENCE,330.000757,0.168244,0.168244,7.86582,-0.199432,...,0.436598,1.098667,4,5,3,3,5,4,5,FAP225 - DEP
4,2011,A0Z40,Agriculteurs salariés,5,HAUTES-ALPES,238.75679,n.d.,-0.237149,n.d.,n.d.,...,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,n.d.,FAP87 - DEP


# The objective is to match the "departement_nom" column of each indicator dataframe (FAP 87 and FAP 225) with the "departement" column of the departements shapefile, so that every indicator row carries the geospatial information needed for future mapping.

1- STANDARDIZATION OF DEPARTEMENTS COLUMN FROM ALL DATASETS

In [41]:
# Standardization function for department names
def standardize_dep(name):
    """Return a normalized version of a department name, suitable for exact-match joins.

    The name is lowercased, stripped of accents, has hyphens and apostrophes
    turned into spaces, has the abbreviation "st" (followed by a space or a dot)
    expanded to "saint", and has its whitespace collapsed and trimmed.

    Parameters
    ----------
    name : str or missing value
        Raw department label. Anything for which ``pd.isna`` is true
        (``None``, ``NaN``, ...) is treated as missing.

    Returns
    -------
    str or None
        The standardized name, or ``None`` when ``name`` is missing.
    """
    if pd.isna(name):  # Handle missing values
        return None
    name = unidecode.unidecode(name.lower().strip())  # Remove accents, lowercase, and strip whitespace
    name = re.sub(r"[-'’]", " ", name)  # Replace hyphens and apostrophes with spaces
    name = re.sub(r"\bst[ .]", "saint ", name)  # Standardize "St." or "St-" to "Saint"
    # Collapse repeated whitespace, then trim again: the substitutions above can leave
    # a space at either end (e.g. a leading/trailing hyphen, or a name ending in "St.").
    name = re.sub(r"\s+", " ", name).strip()
    return name

# Apply standardization
# The same normalization is applied to the join key of each of the three datasets,
# so that department names can later be compared with a plain equality join.
for frame_to_clean, key_column in (
    (indicators_fap87_df, "departement_nom"),
    (indicators_fap225_df, "departement_nom"),
    (departements_shp, "departement"),
):
    frame_to_clean[key_column] = frame_to_clean[key_column].apply(standardize_dep)

2- MERGE DATASETS

In [46]:
# Merge datasets
# Left joins keep every indicator row; a row whose department name has no counterpart
# in the shapefile simply gets missing values in the shapefile columns.

# FAP 87 indicators (must be built from the FAP 87 frame, not the FAP 225 one)
indicators_fap87_df_merged = indicators_fap87_df.merge(
    departements_shp, left_on="departement_nom", right_on="departement", how="left"
)
# A bare expression in the middle of a cell is not rendered, hence the explicit display()
display(indicators_fap87_df_merged.head(5))

# FAP 225 indicators
indicators_fap225_df_merged = indicators_fap225_df.merge(
    departements_shp, left_on="departement_nom", right_on="departement", how="left"
)
indicators_fap225_df_merged.head(5)

Unnamed: 0,Année,Code FAP 225,Libellé FAP 225,Code département,departement_nom,Emploi moyen (2015-2019),Tension,Tension (Valeurs imputées: volumétrie insuffisantes),Intensité d'embauches,Lien formation-emploi,...,Non-durabilité de l'emploi - discret,Conditions de travail contraignantes - discret,Inadéquation géographique - discret,Croisement où volumétrie suffisante,code_insee,departement,nuts3,wikipedia,surf_km2,geometry
0,2011,A0Z40,Agriculteurs salariés,1,ain,458.31432,n.d.,0.479026,n.d.,n.d.,...,n.d.,n.d.,n.d.,FAP87 - DEP,1,ain,FR711,fr:Ain (département),5784.0,"POLYGON ((4.7282 45.946, 4.7282 45.948, 4.7287..."
1,2011,A0Z40,Agriculteurs salariés,2,aisne,1263.645583,-0.578154,-0.578154,0.707496,-0.199432,...,5,4,5,FAP225 - DEP,2,aisne,FR221,fr:Aisne (département),7411.0,"POLYGON ((2.958 49.227, 2.958 49.227, 2.9582 4..."
2,2011,A0Z40,Agriculteurs salariés,3,allier,459.747956,-0.89294,-0.89294,0.721193,-0.199432,...,5,4,5,FAP225 - DEP,3,allier,FR721,fr:Allier (département),7379.0,"POLYGON ((2.2768 46.425, 2.2769 46.425, 2.277 ..."
3,2011,A0Z40,Agriculteurs salariés,4,alpes de haute provence,330.000757,0.168244,0.168244,7.86582,-0.199432,...,5,4,5,FAP225 - DEP,4,alpes de haute provence,FR821,fr:Alpes-de-Haute-Provence,6993.0,"POLYGON ((5.4964 44.103, 5.4973 44.104, 5.4975..."
4,2011,A0Z40,Agriculteurs salariés,5,hautes alpes,238.75679,n.d.,-0.237149,n.d.,n.d.,...,n.d.,n.d.,n.d.,FAP87 - DEP,5,hautes alpes,FR822,fr:Hautes-Alpes,5697.0,"POLYGON ((5.4184 44.425, 5.4185 44.425, 5.4186..."


3- WHICH DEPARTEMENTS WERE NOT MATCHED AND WHY?

In [47]:
# Extract the unique unmatched departement names as lists
# A row is unmatched when the left join brought back no "departement" value from the shapefile.
unmatched_rows_87 = indicators_fap87_df_merged["departement"].isna()
unmatched_rows_225 = indicators_fap225_df_merged["departement"].isna()

unmatched_names_87 = indicators_fap87_df_merged.loc[unmatched_rows_87, "departement_nom"]
unmatched_names_225 = indicators_fap225_df_merged.loc[unmatched_rows_225, "departement_nom"]

unique_unmatched_dep_87_list = unmatched_names_87.drop_duplicates().tolist()
unique_unmatched_dep_225_list = unmatched_names_225.drop_duplicates().tolist()

# Print both lists (FAP 87 first, then FAP 225)
print(unique_unmatched_dep_87_list)
print(unique_unmatched_dep_225_list)

['reunion']
['reunion']


In [51]:
# Since only 1 departement was not matched, we will manually map it
# NOTE(review): assumes the shapefile's standardized name is 'la reunion' — confirm.
manual_mapping = {'reunion': 'la reunion'}

# Apply the manual mapping to the department names of BOTH indicator datasets
# (the FAP 225 frame was previously skipped because the FAP 87 line was duplicated).
indicators_fap87_df['departement_nom'] = indicators_fap87_df['departement_nom'].replace(manual_mapping)
indicators_fap225_df['departement_nom'] = indicators_fap225_df['departement_nom'].replace(manual_mapping)

# Redo the left joins: renaming the key in an already-merged frame would leave the
# shapefile columns (including the geometry) empty for the remapped rows.
indicators_fap87_df_merged = indicators_fap87_df.merge(
    departements_shp, left_on="departement_nom", right_on="departement", how="left"
)
indicators_fap225_df_merged = indicators_fap225_df.merge(
    departements_shp, left_on="departement_nom", right_on="departement", how="left"
)

# Number of rows still without a shapefile match (FAP 87, FAP 225)
(
    int(indicators_fap87_df_merged["departement"].isna().sum()),
    int(indicators_fap225_df_merged["departement"].isna().sum()),
)

4- QUICK VISUALISATION FOR EACH FAP DATAFRAME

In [None]:
# Reconvert into geospatial dataframe
gdf = gpd.GeoDataFrame(indicators_fap87_df_merged, geometry="geometry")

# Define approximate bounding box for mainland France & Corsica
france_bounds = (-5, 10, 41, 52)  # (xmin, xmax, ymin, ymax)
xmin, xmax, ymin, ymax = france_bounds

# Keep only the rows whose geometry intersects this bounding box
gdf_mainland = gdf.cx[xmin:xmax, ymin:ymax]

# The left join attached a departement polygon to every indicator row, so the same
# outline appears many times; keep one row per departement so each polygon is drawn once.
departements_mainland = gdf_mainland.drop_duplicates(subset="departement")

# Plot with very thin edges
fig, ax = plt.subplots(figsize=(8, 10))
departements_mainland.plot(ax=ax, edgecolor="black", linewidth=0.1, alpha=0.5)

# Remove axis ticks for cleaner visualization
ax.set_xticks([])
ax.set_yticks([])
ax.set_title("Departements in Metropolitan France (FAP 87)")

plt.show()

In [None]:
# Reconvert into geospatial dataframe
gdf = gpd.GeoDataFrame(indicators_fap225_df_merged, geometry="geometry")

# Define approximate bounding box for mainland France & Corsica
france_bounds = (-5, 10, 41, 52)  # (xmin, xmax, ymin, ymax)
xmin, xmax, ymin, ymax = france_bounds

# Keep only the rows whose geometry intersects this bounding box
gdf_mainland = gdf.cx[xmin:xmax, ymin:ymax]

# The left join attached a departement polygon to every indicator row, so the same
# outline appears many times; keep one row per departement so each polygon is drawn once.
departements_mainland = gdf_mainland.drop_duplicates(subset="departement")

# Plot with very thin edges
fig, ax = plt.subplots(figsize=(8, 10))
departements_mainland.plot(ax=ax, edgecolor="black", linewidth=0.1, alpha=0.5)

# Remove axis ticks for cleaner visualization
ax.set_xticks([])
ax.set_yticks([])
ax.set_title("Departements in Metropolitan France (FAP 225)")

plt.show()