### Imports

In [1]:
import pandas as pd
import glob
import datetime
import os, shutil
import numpy as np
import swifter
import math

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


### Path set-up

In [2]:
if "DATA_DIR" not in locals():
    DATA_DIR = "./data/"
else:
    print(DATA_DIR)

if os.path.exists(DATA_DIR) and os.path.isdir(DATA_DIR):
    shutil.rmtree(DATA_DIR)
os.makedirs(os.path.dirname(DATA_DIR), exist_ok=True)

In [3]:
if "OUTPUT_DATA_FOLDER" not in locals():
    OUTPUT_DATA_FOLDER = "./output/"
else:
    print(OUTPUT_DATA_FOLDER)

if os.path.exists(OUTPUT_DATA_FOLDER) and os.path.isdir(OUTPUT_DATA_FOLDER):
    shutil.rmtree(OUTPUT_DATA_FOLDER)
os.makedirs(os.path.dirname(OUTPUT_DATA_FOLDER), exist_ok=True)

In [4]:
if "ELASTIC_INDEX" not in locals():
    ELASTIC_INDEX = "siren"
else:
    print(ELASTIC_INDEX)

# Établissement

In [5]:
# Create list of departement zip codes
all_deps = [
    *"-0".join(list(str(x) for x in range(0, 10))).split("-")[1:],
    *list(str(x) for x in range(10, 20)),
    *["2A", "2B"],
    *list(str(x) for x in range(21, 95)),
    *"-7510".join(list(str(x) for x in range(0, 10))).split("-")[1:],
    *"-751".join(list(str(x) for x in range(10, 21))).split("-")[1:],
    *[""],
]
# Remove Paris zip code
all_deps.remove("75")

In [7]:
%%time
# Upload geo data by departement
for dep in all_deps:
    url = "https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_" + dep + ".csv.gz"
    print(url)
    df_dep = pd.read_csv(
        url,
        compression="gzip",
        dtype=str,
        usecols=[
            "siren",
            "siret",
            "dateCreationEtablissement",
            "trancheEffectifsEtablissement",
            "activitePrincipaleRegistreMetiersEtablissement",
            "etablissementSiege",
            "numeroVoieEtablissement",
            "libelleVoieEtablissement",
            "codePostalEtablissement",
            "libelleCommuneEtablissement",
            "typeVoieEtablissement",
            "codeCommuneEtablissement",
            "complementAdresseEtablissement",
            "dateDebut",
            "etatAdministratifEtablissement",
            "enseigne1Etablissement",
            "enseigne1Etablissement",
            "enseigne2Etablissement",
            "enseigne3Etablissement",
            "denominationUsuelleEtablissement",
            "activitePrincipaleEtablissement",
            "geo_adresse",
            "longitude",
            "latitude",
            "indiceRepetitionEtablissement",
        ],
    )
    df_dep = df_dep.rename(
        columns={
            "dateCreationEtablissement": "date_creation",
            "trancheEffectifsEtablissement": "tranche_effectif_salarie",
            "activitePrincipaleRegistreMetiersEtablissement": "activite_principale_registre_metier",
            "etablissementSiege": "is_siege",
            "numeroVoieEtablissement": "numero_voie",
            "typeVoieEtablissement": "type_voie",
            "libelleVoieEtablissement": "libelle_voie",
            "codePostalEtablissement": "code_postal",
            "libelleCommuneEtablissement": "libelle_commune",
            "codeCommuneEtablissement": "commune",
            "complementAdresseEtablissement": "complement_adresse",
            "codeCedexEtablissement": "cedex",
            "dateDebut": "date_debut_activite",
            "etatAdministratifEtablissement": "etat_administratif_etablissement",
            "enseigne1Etablissement": "enseigne_1",
            "enseigne2Etablissement": "enseigne_2",
            "enseigne3Etablissement": "enseigne_3",
            "activitePrincipaleEtablissement": "activite_principale",
            "indiceRepetitionEtablissement": "indice_repetition",
            "denominationUsuelleEtablissement": "nom_commercial",
        }
    )
    df_dep.to_csv(DATA_DIR + "geo_siret_" + dep + ".csv", index=False)

https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_23.csv.gz
CPU times: user 846 ms, sys: 52.2 ms, total: 898 ms
Wall time: 1.03 s


In [8]:
# Get geo data file paths
geo_files = glob.glob(DATA_DIR + "geo_siret*.csv")

In [9]:
geo_files.sort()

In [10]:
%%time
# Compute nbre d'établissements' per 'siren'
df_out = pd.DataFrame()
for geo_file in geo_files:
    print(geo_file)
    df_geo = pd.read_csv(geo_file, dtype=str)
    df_geo = df_geo.replace({np.nan: None})
    df_geo["file"] = geo_file
    # Create column with list of "enseignes" and "nom_commercial"
    df_geo["enseigne"] = df_geo.apply(
        lambda x: list(
            filter(
                None,
                set(
                    [
                        x["enseigne_1"],
                        x["enseigne_2"],
                        x["enseigne_3"],
                        x["nom_commercial"],
                    ]
                ),
            )
        ),
        axis=1,
    )
    df_geo["nombre_etablissements"] = df_geo.groupby(["siren", "file"])[
        "siret"
    ].transform("count")
    df_enseigne = (
        df_geo.groupby(["siren", "file"])["enseigne"]
        .apply(list)
        .reset_index(name="liste_enseigne_dep")
        .drop(columns=["file"], axis=1)
    )
    df_enseigne["liste_enseigne_dep"] = df_enseigne.apply(
        lambda x: list(set(c for b in x.liste_enseigne_dep for c in b)), axis=1
    )
    df_geo = df_geo.merge(df_enseigne, left_on="siren", right_on="siren")
    df_adresse = (
        df_geo.groupby(["siren", "file"])["geo_adresse"]
        .apply(set)
        .reset_index(name="liste_adresse_dep")
        .drop(columns=["file"], axis=1)
    )
    df_geo = df_geo.merge(df_adresse, left_on="siren", right_on="siren")
    df_inter = df_geo[
        [
            "siren",
            "file",
            "nombre_etablissements",
            "liste_enseigne_dep",
            "liste_adresse_dep",
        ]
    ]
    df_out = pd.concat([df_out, df_inter])

./data/geo_siret_23.csv
CPU times: user 5.08 s, sys: 7.5 ms, total: 5.09 s
Wall time: 5.09 s


In [11]:
df_out.head(5)

Unnamed: 0,siren,file,nombre_etablissements,liste_enseigne_dep,liste_adresse_dep
0,38822102,./data/geo_siret_23.csv,1,[],{None}
1,39016357,./data/geo_siret_23.csv,1,[],{None}
2,39027305,./data/geo_siret_23.csv,1,[],{Rue de l’Etang 23190 Bellegarde-en-Marche}
3,39315312,./data/geo_siret_23.csv,1,[],{23 Grande Rue 23800 Dun-le-Palestel}
4,56810336,./data/geo_siret_23.csv,1,[],{4 Avenue Charles de Gaulle 23000 Guéret}


In [12]:
df_out = df_out.drop_duplicates(subset=["siren", "file"], keep="first")
df_liste = (
    df_out.groupby(["siren"])["liste_enseigne_dep"]
    .apply(list)
    .reset_index(name="liste_enseigne")
)
df_out = df_out.merge(df_liste, left_on="siren", right_on="siren")
df_liste = (
    df_out.groupby(["siren"])["liste_adresse_dep"]
    .apply(list)
    .reset_index(name="liste_adresse")
)
df_out = df_out.merge(df_liste, left_on="siren", right_on="siren")

In [13]:
df_out2 = (
    df_out[["siren", "nombre_etablissements"]].groupby(["siren"], as_index=False).sum()
)

In [14]:
df_out2 = df_out2.merge(
    df_out[["liste_enseigne", "liste_adresse", "siren"]], on="siren", how="left"
)
df_out2 = df_out2.drop_duplicates(subset=["siren"], keep="first")

In [15]:
df_out2["liste_enseigne"] = df_out2.apply(
    lambda x: list(set(c for b in x.liste_enseigne for c in b)), axis=1
)
df_out2["liste_adresse"] = df_out2.apply(
    lambda x: list(set(c for b in x.liste_adresse for c in b)), axis=1
)

In [16]:
df_out2.head(3)

Unnamed: 0,siren,nombre_etablissements,liste_enseigne,liste_adresse
0,38822102,1,[],[None]
1,39016357,1,[],[None]
2,39027305,1,[],[Rue de l’Etang 23190 Bellegarde-en-Marche]


# Unité Légale

In [17]:
%%time
# Import Stock Unite Legale data
df_unite_legale = pd.read_csv(
    "https://files.data.gouv.fr/insee-sirene/StockUniteLegale_utf8.zip",
    compression="zip",
    dtype=str,
    usecols=[
        "siren",
        "dateCreationUniteLegale",
        "sigleUniteLegale",
        "prenom1UniteLegale",
        "identifiantAssociationUniteLegale",
        "trancheEffectifsUniteLegale",
        "dateDernierTraitementUniteLegale",
        "categorieEntreprise",
        "etatAdministratifUniteLegale",
        "nomUniteLegale",
        "nomUsageUniteLegale",
        "denominationUniteLegale",
        "categorieJuridiqueUniteLegale",
        "activitePrincipaleUniteLegale",
        "economieSocialeSolidaireUniteLegale",
    ],
)

CPU times: user 50.9 s, sys: 4.01 s, total: 54.9 s
Wall time: 58.5 s


In [18]:
%%time
# Rename columns
df_unite_legale = df_unite_legale.rename(
    columns={
        "dateCreationUniteLegale": "date_creation_entreprise",
        "sigleUniteLegale": "sigle",
        "prenom1UniteLegale": "prenom",
        "trancheEffectifsUniteLegale": "tranche_effectif_salarie_entreprise",
        "dateDernierTraitementUniteLegale": "date_mise_a_jour",
        "categorieEntreprise": "categorie_entreprise",
        "etatAdministratifUniteLegale":"etat_administratif_unite_legale",
        "nomUniteLegale": "nom",
        "nomUsageUniteLegale": "nom_usage",
        "denominationUniteLegale": "nom_raison_sociale",
        "categorieJuridiqueUniteLegale": "nature_juridique_entreprise",
        "activitePrincipaleUniteLegale": "activite_principale_entreprise",
        "economieSocialeSolidaireUniteLegale":"economie_sociale_solidaire_unite_legale",
        "identifiantAssociationUniteLegale":"identifiant_association_unite_legale",
    }
)

CPU times: user 1.46 s, sys: 131 ms, total: 1.59 s
Wall time: 1.57 s


### Data preprocessing

In [19]:
def nom_complet(x):
    if x["nature_juridique_entreprise"] == "1000":
        if x["sigle"] == x["sigle"]:
            if (x["prenom"] == x["prenom"]) & (x["nom"] == x["nom"]):
                if x["nom usage"] == x["nom_usage"]:
                    return (
                        x["prenom"].lower()
                        + " "
                        + x["nom_usage"].lower()
                        + " ("
                        + x["nom"].lower()
                        + ", "
                        + x["sigle"].lower()
                        + ")"
                    )
                else:
                    return (
                        x["prenom"].lower()
                        + " "
                        + x["nom"].lower()
                        + " ("
                        + x["sigle"].lower()
                        + ")"
                    )
            else:
                return None
        else:
            if (x["prenom"] == x["prenom"]) & (x["nom"] == x["nom"]):
                if x["nom_usage"] == x["nom_usage"]:
                    return (
                        x["prenom"].lower()
                        + " "
                        + x["nom_usage"].lower()
                        + " ("
                        + x["nom"].lower()
                        + ")"
                    )
                else:
                    return x["prenom"].lower() + " " + x["nom"].lower()
            else:
                return None
    else:
        if x["sigle"] == x["sigle"]:
            if x["nom_raison_sociale"] == x["nom_raison_sociale"]:
                return x["nom_raison_sociale"].lower() + " (" + x["sigle"].lower() + ")"
            else:
                return None
        else:
            if x["nom_raison_sociale"] == x["nom_raison_sociale"]:
                return x["nom_raison_sociale"].lower()
            else:
                return None

In [20]:
%%time
# Add nom_complet column to df_unite_legale
# swifter.allow_dask_on_strings()
df_unite_legale["nom_complet"] = df_unite_legale.apply(
    lambda row: nom_complet(row), axis=1
)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.2 µs


'\ndf_unite_legale["nom_complet"] = df_unite_legale.swifter.allow_dask_on_strings().apply(\n    lambda row: nom_complet(row), axis=1\n)\n'

# Merge unité légale et établissements


In [21]:
df_unite_legale = pd.merge(df_unite_legale, df_out2, on="siren", how="left")

In [22]:
df_unite_legale.head(3)

Unnamed: 0,siren,date_creation_entreprise,sigle,prenom,identifiant_association_unite_legale,tranche_effectif_salarie_entreprise,date_mise_a_jour,categorie_entreprise,etat_administratif_unite_legale,nom,nom_usage,nom_raison_sociale,nature_juridique_entreprise,activite_principale_entreprise,economie_sociale_solidaire_unite_legale,nombre_etablissements,liste_enseigne,liste_adresse
0,325175,2000-09-26,,THIERRY,,,2019-12-13T13:21:28,PME,A,JANOYER,,,1000,32.12Z,,,,
1,1807254,1972-05-01,,JACQUES-LUCIEN,,,2016-07-10T05:00:06,,C,BRETON,,,1000,85.59A,,,,
2,5410220,1954-12-25,,GEORGES,,,,,C,WATTEBLED,,,1000,22.02,,,,


# Nombre établissements

In [23]:
# Compute 'nombre etablissements ouvert' per 'siren'
df_out = pd.DataFrame()
for geo_file in geo_files:
    print(geo_file)
    df_geo = pd.read_csv(geo_file, dtype=str)
    df_geo = df_geo[df_geo["etat_administratif_etablissement"] == "A"]
    df_geo["file"] = geo_file
    df_geo["nombre_etablissements_ouvert"] = df_geo.groupby(["siren", "file"])[
        "siret"
    ].transform("count")
    df_inter = df_geo[["siren", "file", "nombre_etablissements_ouvert"]]
    df_out = pd.concat([df_out, df_inter])
df_out = df_out.drop_duplicates(keep="first")
df_out2 = (
    df_out[["siren", "nombre_etablissements_ouvert"]]
    .groupby(["siren"], as_index=False)
    .sum()
)
df_unite_legale = pd.merge(df_unite_legale, df_out2, on="siren", how="left")

./data/geo_siret_23.csv


In [24]:
# Merge geo files with above dataframe and add is_entrepreneur_individuel
for geo_file in geo_files:
    print(geo_file)
    df_geo = pd.read_csv(geo_file, dtype=str)
    df_inter = pd.merge(df_geo, df_unite_legale, on="siren", how="left")
    df_inter2 = df_inter[df_inter["is_siege"] == "true"]
    df_inter2["concat_nom_adr_siren"] = (
        df_inter2["nom_complet"]
        + " "
        + df_inter2["geo_adresse"]
        + " "
        + df_inter2["siren"]
    )
   
    # df_inter2['concat_enseigne_adresse'] = df_inter2.apply(lambda x: set.union(x.liste_enseigne, x.liste_adresse), axis=1)
    df_inter2["concat_enseigne_adresse"] = (
        df_inter2["liste_enseigne"] + df_inter2["liste_adresse"]
    )
    
    df_inter2["is_entrepreneur_individuel"] = df_inter2.apply(lambda x: True if x.nature_juridique_entreprise in ['1', '10', '1000'] else False, axis=1) # entrepreneur individuel
    df_inter2["coordonnees"] = df_inter2.apply(lambda x: None if ((x.latitude!=x.latitude) or (x.longitude!=x.longitude)) else (x.latitude + "," + x.longitude) , axis=1)
    '''
    df_inter.to_csv(
        OUTPUT_DATA_FOLDER + "siret_" + geo_file.replace(DATA_DIR + "geo_siret_", ""),
        index=False,
    )
    '''
    df_inter2.to_csv(
        OUTPUT_DATA_FOLDER
        + ELASTIC_INDEX
        + "_"
        + geo_file.replace(DATA_DIR + "geo_siret_", ""),
        index=False,
    )

./data/geo_siret_23.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_inter2["concat_enseigne_adresse"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_inter2["is_entrepreneur_individuel"] = df_inter2.apply(lambda x: True if x.nature_juridique_entreprise in ['1', '10', '1000'] else False, axis=1) # entrepreneur individuel
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versu

In [25]:
df_inter2.head(5)

Unnamed: 0,siren,siret,date_creation,tranche_effectif_salarie,activite_principale_registre_metier,is_siege,complement_adresse,numero_voie,indice_repetition,type_voie,...,nature_juridique_entreprise,activite_principale_entreprise,economie_sociale_solidaire_unite_legale,nombre_etablissements,liste_enseigne,liste_adresse,nombre_etablissements_ouvert,concat_enseigne_adresse,is_entrepreneur_individuel,coordonnees
0,38822102,3882210200018,1997-12-25,,,True,,,,,...,9110,81.10Z,,1.0,[],[None],1.0,[None],False,
1,39016357,3901635700012,,,,True,,,,,...,9110,70.3C,,1.0,[],[None],,[None],False,
2,39027305,3902730500018,,,,True,,,,RUE,...,9110,70.3C,,1.0,[],[Rue de l’Etang 23190 Bellegarde-en-Marche],,[Rue de l’Etang 23190 Bellegarde-en-Marche],False,"2.292848,45.981419"
3,39315312,3931531200015,1997-12-25,,,True,,23.0,,GR,...,9110,81.10Z,,1.0,[],[23 Grande Rue 23800 Dun-le-Palestel],1.0,[23 Grande Rue 23800 Dun-le-Palestel],False,"1.66628,46.30549"
14,86003100,8600310000020,1995-01-01,NN,,True,,,,,...,1000,01.45Z,,1.0,[],[Puy Bessat 23460 Saint-Yrieix-la-Montagne],1.0,[Puy Bessat 23460 Saint-Yrieix-la-Montagne],True,"1.994628,45.871199"


In [28]:
df_inter2.head(3)

Index(['siren', 'siret', 'date_creation', 'tranche_effectif_salarie',
       'activite_principale_registre_metier', 'is_siege', 'complement_adresse',
       'numero_voie', 'indice_repetition', 'type_voie', 'libelle_voie',
       'code_postal', 'libelle_commune', 'commune', 'date_debut_activite',
       'etat_administratif_etablissement', 'enseigne_1', 'enseigne_2',
       'enseigne_3', 'nom_commercial', 'activite_principale', 'longitude',
       'latitude', 'geo_adresse', 'date_creation_entreprise', 'sigle',
       'prenom', 'identifiant_association_unite_legale',
       'tranche_effectif_salarie_entreprise', 'date_mise_a_jour',
       'categorie_entreprise', 'etat_administratif_unite_legale', 'nom',
       'nom_usage', 'nom_raison_sociale', 'nature_juridique_entreprise',
       'activite_principale_entreprise',
       'economie_sociale_solidaire_unite_legale', 'nombre_etablissements',
       'liste_enseigne', 'liste_adresse', 'nombre_etablissements_ouvert',
       'concat_enseigne_adr