### Imports

In [None]:
import pandas as pd
import glob
import datetime
import os, shutil
import numpy as np
import swifter
import math
import json

### Path set-up

In [None]:
# Directory receiving the per-departement CSV files. A value that is already
# defined in the session is kept and echoed; otherwise the default is used.
if "DATA_DIR" in locals():
    print(DATA_DIR)
else:
    DATA_DIR = "./data/"

# Start from an empty directory: remove what a previous run left, then
# recreate it. dirname() relies on DATA_DIR ending with a path separator.
if os.path.isdir(DATA_DIR):
    shutil.rmtree(DATA_DIR)
os.makedirs(os.path.dirname(DATA_DIR), exist_ok=True)

In [None]:
# Directory receiving the final CSV exports. A value that is already defined
# in the session is kept and echoed; otherwise the default is used.
if "OUTPUT_DATA_FOLDER" in locals():
    print(OUTPUT_DATA_FOLDER)
else:
    OUTPUT_DATA_FOLDER = "./output/"

# Start from an empty directory: remove what a previous run left, then
# recreate it. dirname() relies on the folder ending with a path separator.
if os.path.isdir(OUTPUT_DATA_FOLDER):
    shutil.rmtree(OUTPUT_DATA_FOLDER)
os.makedirs(os.path.dirname(OUTPUT_DATA_FOLDER), exist_ok=True)

In [None]:
# Folder holding the label files. Keep (and echo) a value already defined in
# the session, otherwise fall back to the default.
if "LABELS_FOLDER" in locals():
    print(LABELS_FOLDER)
else:
    LABELS_FOLDER = "./labels/"

In [None]:
# Name used as prefix of the exported files. Keep (and echo) a value already
# defined in the session, otherwise fall back to the default.
if "ELASTIC_INDEX" in locals():
    print(ELASTIC_INDEX)
else:
    ELASTIC_INDEX = "siren"

# Établissement

In [None]:
# Build the list of suffixes of the per-departement files to download
# ("geo_siret_<suffix>.csv.gz"). These are departement codes, except for
# Paris, which is listed arrondissement by arrondissement (75101..75120)
# instead of as a single "75" entry.
all_deps = [
    # 01 .. 19
    *(f"{number:02d}" for number in range(1, 20)),
    # Corsica
    "2A",
    "2B",
    # 21 .. 95, without 75 (covered by the arrondissements below)
    *(str(number) for number in range(21, 96) if number != 75),
    # 75101 .. 75120. The former join/split construction silently dropped
    # 75110, so that arrondissement was never downloaded.
    *(f"751{number:02d}" for number in range(1, 21)),
    "971",
    "972",
    "973",
    "974",
    "976",
    # Empty suffix, i.e. the file "geo_siret_.csv.gz".
    # NOTE(review): presumably establishments without a departement - confirm.
    "",
]

In [None]:
%%time
# Download the geo-sirene establishment file of every entry of all_deps, keep
# only the columns used by the notebook, rename them to snake_case and store
# the result as an uncompressed CSV in DATA_DIR.

# Columns read from the source file and kept under their original name.
_geo_passthrough_columns = [
    "siren",
    "siret",
    "geo_adresse",
    "geo_id",
    "longitude",
    "latitude",
]

# Source column name -> column name used in the rest of the notebook.
_geo_renamed_columns = {
    "dateCreationEtablissement": "date_creation",
    "trancheEffectifsEtablissement": "tranche_effectif_salarie",
    "activitePrincipaleRegistreMetiersEtablissement": "activite_principale_registre_metier",
    "etablissementSiege": "is_siege",
    "numeroVoieEtablissement": "numero_voie",
    "typeVoieEtablissement": "type_voie",
    "libelleVoieEtablissement": "libelle_voie",
    "codePostalEtablissement": "code_postal",
    "libelleCedexEtablissement": "libelle_cedex",
    "libelleCommuneEtablissement": "libelle_commune",
    "codeCommuneEtablissement": "commune",
    "complementAdresseEtablissement": "complement_adresse",
    "complementAdresse2Etablissement": "complement_adresse_2",
    "numeroVoie2Etablissement": "numero_voie_2",
    "indiceRepetition2Etablissement": "indice_repetition_2",
    "typeVoie2Etablissement": "type_voie_2",
    "libelleVoie2Etablissement": "libelle_voie_2",
    "codeCommune2Etablissement": "commune_2",
    "libelleCommune2Etablissement": "libelle_commune_2",
    "codeCedex2Etablissement": "cedex_2",
    "libelleCedex2Etablissement": "libelle_cedex_2",
    "codeCedexEtablissement": "cedex",
    "dateDebut": "date_debut_activite",
    "distributionSpecialeEtablissement": "distribution_speciale",
    "distributionSpeciale2Etablissement": "distribution_speciale_2",
    "etatAdministratifEtablissement": "etat_administratif_etablissement",
    "enseigne1Etablissement": "enseigne_1",
    "enseigne2Etablissement": "enseigne_2",
    "enseigne3Etablissement": "enseigne_3",
    "activitePrincipaleEtablissement": "activite_principale",
    "indiceRepetitionEtablissement": "indice_repetition",
    "denominationUsuelleEtablissement": "nom_commercial",
    "libelleCommuneEtrangerEtablissement": "libelle_commune_etranger",
    "codePaysEtrangerEtablissement": "code_pays_etranger",
    "libellePaysEtrangerEtablissement": "libelle_pays_etranger",
    "libelleCommuneEtranger2Etablissement": "libelle_commune_etranger_2",
    "codePaysEtranger2Etablissement": "code_pays_etranger_2",
    "libellePaysEtranger2Etablissement": "libelle_pays_etranger_2",
}

# Every column that is read is either passed through or renamed.
_geo_usecols = _geo_passthrough_columns + list(_geo_renamed_columns)

for dep in all_deps:
    url = f"https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_{dep}.csv.gz"
    print(url)
    # Everything is read as text so that codes keep their leading zeros.
    df_dep = pd.read_csv(
        url,
        compression="gzip",
        dtype=str,
        usecols=_geo_usecols,
    ).rename(columns=_geo_renamed_columns)
    df_dep.to_csv(f"{DATA_DIR}geo_siret_{dep}.csv", index=False)

In [None]:
# Collect the paths of the per-departement CSV files stored in DATA_DIR
_geo_pattern = f"{DATA_DIR}geo_siret*.csv"
geo_files = glob.glob(_geo_pattern)

In [None]:
# Process the files in lexicographic path order
geo_files = sorted(geo_files)

In [None]:
%%time
# For every departement file, compute per 'siren' the number of establishments
# and collect the distinct names ("enseignes" / "nom_commercial") and
# geo_adresse values found in that file. df_out ends up with one row per
# establishment, carrying the per-(siren, file) aggregates.
_frames_per_file = []
for geo_file in geo_files:
    print(geo_file)
    df_geo = pd.read_csv(geo_file, dtype=str)
    # None (unlike NaN) is falsy, so filter(None, ...) below drops missing names.
    df_geo = df_geo.replace({np.nan: None})
    df_geo["file"] = geo_file
    # Distinct, non-empty names of each establishment.
    df_geo["enseigne"] = df_geo.apply(
        lambda x: list(
            filter(
                None,
                {
                    x["enseigne_1"],
                    x["enseigne_2"],
                    x["enseigne_3"],
                    x["nom_commercial"],
                },
            )
        ),
        axis=1,
    )
    df_geo["nombre_etablissements"] = df_geo.groupby(["siren", "file"])[
        "siret"
    ].transform("count")
    # One list of per-establishment name lists per siren, flattened right after.
    df_enseigne = (
        df_geo.groupby(["siren", "file"])["enseigne"]
        .apply(list)
        .reset_index(name="liste_enseigne_dep")
        .drop(columns=["file"])
    )
    df_enseigne["liste_enseigne_dep"] = df_enseigne.apply(
        lambda x: list(set(c for b in x.liste_enseigne_dep for c in b)), axis=1
    )
    df_geo = df_geo.merge(df_enseigne, on="siren")
    # Set of the geo_adresse values of each siren within this file.
    df_adresse = (
        df_geo.groupby(["siren", "file"])["geo_adresse"]
        .apply(set)
        .reset_index(name="liste_adresse_dep")
        .drop(columns=["file"])
    )
    df_geo = df_geo.merge(df_adresse, on="siren")
    _frames_per_file.append(
        df_geo[
            [
                "siren",
                "file",
                "nombre_etablissements",
                "liste_enseigne_dep",
                "liste_adresse_dep",
            ]
        ]
    )

# Concatenate once: growing df_out with pd.concat inside the loop re-copies
# all previously accumulated rows at every iteration.
df_out = pd.concat(_frames_per_file) if _frames_per_file else pd.DataFrame()

In [None]:
# Preview the first five rows
df_out.head()

In [None]:
# Keep one row per (siren, file), then attach to every row the per-file lists
# of all the files in which the same siren appears.
df_out = df_out.drop_duplicates(subset=["siren", "file"], keep="first")
for _per_file_column, _all_files_column in (
    ("liste_enseigne_dep", "liste_enseigne"),
    ("liste_adresse_dep", "liste_adresse"),
):
    _grouped = df_out.groupby(["siren"])[_per_file_column]
    df_liste = _grouped.apply(list).reset_index(name=_all_files_column)
    df_out = df_out.merge(df_liste, on="siren")

In [None]:
# Total number of establishments per siren, summed over the departement files
_counts_per_file = df_out[["siren", "nombre_etablissements"]]
df_out2 = _counts_per_file.groupby(["siren"], as_index=False).sum()

In [None]:
# Bring the cross-file lists next to the totals; df_out has one row per
# (siren, file), so only the first match of each siren is kept.
_lists_per_row = df_out[["liste_enseigne", "liste_adresse", "siren"]]
df_out2 = df_out2.merge(_lists_per_row, on="siren", how="left").drop_duplicates(
    subset=["siren"], keep="first"
)

In [None]:
def _flatten_unique(nested):
    """Return the distinct items found across the collections in *nested*."""
    return list({item for group in nested for item in group})


# Each cell holds one collection per departement file: merge them into a
# single list of distinct values.
for _column in ("liste_enseigne", "liste_adresse"):
    df_out2[_column] = df_out2.apply(
        lambda row, _column=_column: _flatten_unique(row[_column]), axis=1
    )

In [None]:
# Preview the first three rows
df_out2.head(n=3)

# Unité Légale

In [None]:
%%time
# Load the "StockUniteLegale" file, restricted to the columns used below.
# Everything is read as text.
_unite_legale_url = "https://files.data.gouv.fr/insee-sirene/StockUniteLegale_utf8.zip"
_unite_legale_columns = [
    "siren",
    "dateCreationUniteLegale",
    "sigleUniteLegale",
    "prenom1UniteLegale",
    "identifiantAssociationUniteLegale",
    "trancheEffectifsUniteLegale",
    "dateDernierTraitementUniteLegale",
    "categorieEntreprise",
    "etatAdministratifUniteLegale",
    "nomUniteLegale",
    "nomUsageUniteLegale",
    "denominationUniteLegale",
    "categorieJuridiqueUniteLegale",
    "activitePrincipaleUniteLegale",
    "economieSocialeSolidaireUniteLegale",
]
df_unite_legale = pd.read_csv(
    _unite_legale_url,
    compression="zip",
    dtype=str,
    usecols=_unite_legale_columns,
)

In [None]:
%%time
# Switch the legal-unit columns to the snake_case names used in the rest of
# the notebook ("siren" keeps its name).
_unite_legale_renamed_columns = {
    "dateCreationUniteLegale": "date_creation_unite_legale",
    "sigleUniteLegale": "sigle",
    "prenom1UniteLegale": "prenom",
    "trancheEffectifsUniteLegale": "tranche_effectif_salarie_unite_legale",
    "dateDernierTraitementUniteLegale": "date_mise_a_jour_unite_legale",
    "categorieEntreprise": "categorie_entreprise",
    "etatAdministratifUniteLegale": "etat_administratif_unite_legale",
    "nomUniteLegale": "nom",
    "nomUsageUniteLegale": "nom_usage",
    "denominationUniteLegale": "nom_raison_sociale",
    "categorieJuridiqueUniteLegale": "nature_juridique_unite_legale",
    "activitePrincipaleUniteLegale": "activite_principale_unite_legale",
    "economieSocialeSolidaireUniteLegale": "economie_sociale_solidaire_unite_legale",
    "identifiantAssociationUniteLegale": "identifiant_association_unite_legale",
}
df_unite_legale = df_unite_legale.rename(columns=_unite_legale_renamed_columns)


### Data preprocessing

In [None]:
def nom_complet(x):
    """Build the lower-cased display name of a legal unit.

    Args:
        x: Mapping-like row (for instance a pandas Series) exposing the keys
            "nature_juridique_unite_legale", "sigle", "prenom", "nom",
            "nom_usage" and "nom_raison_sociale". A value that is NaN or
            None is treated as missing.

    Returns:
        For nature juridique "1000":
            "<prenom> <nom_usage> (<nom>, <sigle>)", where "<nom_usage>"
            falls back to "<nom>" (then not repeated in the parentheses) and
            the sigle is omitted when missing; None when "prenom" or "nom"
            is missing.
        For any other nature juridique:
            "<nom_raison_sociale> (<sigle>)", without the parentheses when
            the sigle is missing; None when "nom_raison_sociale" is missing.
    """

    def _present(value):
        # NaN is the only value that is not equal to itself.
        return value is not None and value == value

    sigle = x["sigle"].lower() if _present(x["sigle"]) else None

    if x["nature_juridique_unite_legale"] != "1000":
        if not _present(x["nom_raison_sociale"]):
            return None
        denomination = x["nom_raison_sociale"].lower()
        return denomination if sigle is None else f"{denomination} ({sigle})"

    if not (_present(x["prenom"]) and _present(x["nom"])):
        return None
    prenom = x["prenom"].lower()
    nom = x["nom"].lower()

    # The key is "nom_usage"; the previous lookup of "nom usage" (with a
    # space) raised KeyError whenever a sigle, first name and name were set.
    if _present(x["nom_usage"]):
        nom_usage = x["nom_usage"].lower()
        details = nom if sigle is None else f"{nom}, {sigle}"
        return f"{prenom} {nom_usage} ({details})"

    if sigle is None:
        return f"{prenom} {nom}"
    return f"{prenom} {nom} ({sigle})"

In [None]:
%%time
# Derive the display name of every legal unit, one row at a time
# swifter.allow_dask_on_strings()
df_unite_legale["nom_complet"] = df_unite_legale.apply(nom_complet, axis=1)

# Merge unité légale et établissements

In [None]:
# Attach the per-siren establishment aggregates; legal units without any
# match keep missing values in the added columns (left join).
df_unite_legale = df_unite_legale.merge(df_out2, on="siren", how="left")

In [None]:
# Preview the first three rows
df_unite_legale.head(n=3)

# Sections Codes NAF

In [None]:
def load_file(file_name: str):
    """Read a JSON file and return its decoded content.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the locale-dependent
    # default encoding, which breaks accented text on some platforms.
    with open(file_name, encoding="utf-8") as json_file:
        return json.load(json_file)
# sections_NAF = load_file(f"{LABELS_FOLDER}sections_codes_naf.json")

In [None]:
# (section letter, first division, last division), both bounds inclusive.
_naf_section_ranges = (
    ("A", 1, 3),
    ("B", 5, 9),
    ("C", 10, 33),
    ("D", 35, 35),
    ("E", 36, 39),
    ("F", 41, 43),
    ("G", 45, 47),
    ("H", 49, 53),
    ("I", 55, 56),
    ("J", 58, 63),
    ("K", 64, 66),
    ("L", 68, 68),
    ("M", 69, 75),
    ("N", 77, 82),
    ("O", 84, 84),
    ("P", 85, 85),
    ("Q", 86, 88),
    ("R", 90, 93),
    ("S", 94, 96),
    ("T", 97, 98),
    ("U", 99, 99),
)

# Two-digit division code (as text, zero-padded) -> section letter.
sections_NAF = {
    f"{division:02d}": section
    for section, first, last in _naf_section_ranges
    for division in range(first, last + 1)
}

# Nombre établissements

In [None]:
# Compute 'nombre_etablissements_ouverts' per 'siren': the number of
# establishments whose 'etat_administratif_etablissement' is "A", summed over
# all departement files, then attach it to the legal units.
_open_counts_per_file = []
for geo_file in geo_files:
    print(geo_file)
    df_geo = pd.read_csv(geo_file, dtype=str)
    # .copy(): the columns added below must go to an independent frame, not
    # to a filtered slice of the file (pandas SettingWithCopy warning).
    df_geo = df_geo[df_geo["etat_administratif_etablissement"] == "A"].copy()
    df_geo["file"] = geo_file
    df_geo["nombre_etablissements_ouverts"] = df_geo.groupby(["siren", "file"])[
        "siret"
    ].transform("count")
    _open_counts_per_file.append(
        df_geo[["siren", "file", "nombre_etablissements_ouverts"]]
    )

# Concatenate once: growing df_out with pd.concat inside the loop re-copies
# all previously accumulated rows at every iteration.
if _open_counts_per_file:
    df_out = pd.concat(_open_counts_per_file)
else:
    df_out = pd.DataFrame(columns=["siren", "file", "nombre_etablissements_ouverts"])

# All rows of a (siren, file) pair are identical: keep one before summing.
df_out = df_out.drop_duplicates(keep="first")
df_out2 = (
    df_out[["siren", "nombre_etablissements_ouverts"]]
    .groupby(["siren"], as_index=False)
    .sum()
)
df_unite_legale = pd.merge(df_unite_legale, df_out2, on="siren", how="left")

In [None]:
# The first two characters of the main activity code select the NAF section;
# codes without an entry in sections_NAF give a missing value.
_naf_division = df_unite_legale["activite_principale_unite_legale"].str.slice(0, 2)
df_unite_legale["section_activite_principale"] = _naf_division.map(sections_NAF)

In [None]:
def adresse_complete(row):
    """Assemble the one-line address of an establishment.

    The address is made of the street-level fields, then either the cedex
    code and label (when a cedex code is set) or the commune code and label,
    then the foreign commune and country labels. Fields that are missing
    (None, NaN or empty) are skipped, so a missing label is no longer
    rendered as the literal text "None" or "nan".

    Args:
        row: Mapping-like row (for instance a pandas Series) exposing the
            address keys read below.

    Returns:
        The parts joined by single spaces, or "" when every field is missing.
    """

    def _missing(value):
        # NaN is the only value that is not equal to itself.
        return value is None or value != value

    def _parts(columns):
        return [
            str(row[column])
            for column in columns
            if not _missing(row[column]) and row[column]
        ]

    parts = _parts(
        [
            "complement_adresse",
            "numero_voie",
            "indice_repetition",
            "type_voie",
            "libelle_voie",
            "distribution_speciale",
        ]
    )
    # The cedex routing replaces the commune when it is available.
    if not _missing(row["cedex"]):
        parts += _parts(["cedex", "libelle_cedex"])
    elif not _missing(row["commune"]):
        parts += _parts(["commune", "libelle_commune"])
    parts += _parts(["libelle_commune_etranger", "libelle_pays_etranger"])
    return " ".join(parts).strip()

In [None]:
def adresse_complete_2(row):
    """Assemble the one-line secondary address of an establishment.

    Same layout as the primary address, built from the "*_2" fields: the
    street-level fields, then either the cedex code and label (when a cedex
    code is set) or the commune code and label, then the foreign commune and
    country labels. Fields that are missing (None, NaN or empty) are skipped,
    so a missing label is no longer rendered as the literal text "None" or
    "nan".

    Args:
        row: Mapping-like row (for instance a pandas Series) exposing the
            "*_2" address keys read below.

    Returns:
        The parts joined by single spaces, or "" when every field is missing.
    """

    def _missing(value):
        # NaN is the only value that is not equal to itself.
        return value is None or value != value

    def _parts(columns):
        return [
            str(row[column])
            for column in columns
            if not _missing(row[column]) and row[column]
        ]

    parts = _parts(
        [
            "complement_adresse_2",
            "numero_voie_2",
            "indice_repetition_2",
            "type_voie_2",
            "libelle_voie_2",
            "distribution_speciale_2",
        ]
    )
    # The cedex routing replaces the commune when it is available.
    if not _missing(row["cedex_2"]):
        parts += _parts(["cedex_2", "libelle_cedex_2"])
    elif not _missing(row["commune_2"]):
        parts += _parts(["commune_2", "libelle_commune_2"])
    parts += _parts(["libelle_commune_etranger_2", "libelle_pays_etranger_2"])
    return " ".join(parts).strip()

In [None]:
# For every departement file: join the establishments with the legal-unit
# dataframe, keep only the rows flagged as head office (is_siege == "true"),
# derive the extra columns below, and write the result to OUTPUT_DATA_FOLDER
# as "<ELASTIC_INDEX>_<suffix of the input file name>".
for geo_file in geo_files:
    print(geo_file)
    df_geo = pd.read_csv(geo_file, dtype=str)
    # Left join: establishments without a matching legal unit are kept.
    df_inter = pd.merge(df_geo, df_unite_legale, on="siren", how="left")
    df_inter2 = df_inter[df_inter["is_siege"] == "true"]
    # Missing values become None: the `is None` tests below and in
    # adresse_complete / adresse_complete_2 rely on it.
    df_inter2 = df_inter2.replace({np.nan: None})
    # Concatenation of display name, geo_adresse and siren.
    df_inter2["concat_nom_adr_siren"] = (
        df_inter2["nom_complet"]
        + " "
        + df_inter2["geo_adresse"]
        + " "
        + df_inter2["siren"]
    )
   
    # df_inter2['concat_enseigne_adresse'] = df_inter2.apply(lambda x: set.union(x.liste_enseigne, x.liste_adresse), axis=1)
    # Element-wise `+` on the two list columns.
    df_inter2["concat_enseigne_adresse"] = (
        df_inter2["liste_enseigne"] + df_inter2["liste_adresse"]
    )
    
    df_inter2["is_entrepreneur_individuel"] = df_inter2.apply(lambda x: True if x.nature_juridique_unite_legale in ['1', '10', '1000'] else False, axis=1) # sole proprietor ("entrepreneur individuel")
    # "<latitude>,<longitude>" as text, or None when either one is missing.
    df_inter2["coordonnees"] = df_inter2.apply(lambda x: None if ((x.latitude is None) or (x.longitude is None)) else (x.latitude + "," + x.longitude) , axis=1)
    # Defaults for rows without counts: 0 open establishments, 1 establishment.
    df_inter2['nombre_etablissements_ouverts'] = df_inter2['nombre_etablissements_ouverts'].replace({np.nan: 0})
    df_inter2['nombre_etablissements'] = df_inter2['nombre_etablissements'].replace({np.nan: 1})
    df_inter2['nombre_etablissements'] = df_inter2['nombre_etablissements'].astype(int)
    df_inter2['nombre_etablissements_ouverts'] = df_inter2['nombre_etablissements_ouverts'].astype(int)
    # First three characters of the commune code when it starts with "97",
    # otherwise the first two; None when the commune code is missing.
    df_inter2['departement'] = df_inter2.apply(lambda x: str(x.commune)[:3] if str(x.commune)[:2]=="97" else (None if x.commune is None else str(x.commune)[:2]), axis=1)
    # Every remaining row is a head office, so the flag carries no information.
    df_inter2.drop(columns='is_siege', axis=1, inplace=True)
    df_inter2['adresse_etablissement'] = df_inter2.apply(lambda x: adresse_complete(x), axis=1)
    df_inter2['adresse_etablissement_2'] = df_inter2.apply(lambda x: adresse_complete_2(x), axis=1)
    df_inter2['is_entrepreneur_individuel'] = df_inter2['is_entrepreneur_individuel'].map({True: 'true', False: 'false'}) # Elastic only takes 'true' and 'false' as bool
    # The establishment-level columns describe the head office: say so in
    # their names.
    df_inter2 = df_inter2.rename(
        columns={
            "activite_principale": "activite_principale_siege",
            "date_creation": "date_creation_siege",
            "date_debut_activite": "date_debut_activite_siege",
            "etat_administratif_etablissement": "etat_administratif_siege",
            "siret": "siret_siege",
            "tranche_effectif_salarie": "tranche_effectif_salarie_siege",
        }
    )
    # The secondary-address fields are dropped once 'adresse_etablissement_2'
    # has been built from them.
    df_inter2.drop(
        columns=[
            "complement_adresse_2", "numero_voie_2", "indice_repetition_2", "type_voie_2",
            "libelle_voie_2", "distribution_speciale_2", "cedex_2", "libelle_commune_2",
            "commune_2","libelle_cedex_2","libelle_commune_etranger_2", "code_pays_etranger_2", "libelle_pays_etranger_2"], axis=1, inplace=True)
    # The string statement below is inert: it holds a disabled export of the
    # unfiltered join (df_inter).
    '''
    df_inter.to_csv(
        OUTPUT_DATA_FOLDER + "siret_" + geo_file.replace(DATA_DIR + "geo_siret_", ""),
        index=False,
    )
    '''
    # Output name: "<ELASTIC_INDEX>_" followed by what remains of the input
    # path once the "<DATA_DIR>geo_siret_" prefix is removed.
    df_inter2.to_csv(
        OUTPUT_DATA_FOLDER
        + ELASTIC_INDEX
        + "_"
        + geo_file.replace(DATA_DIR + "geo_siret_", ""),
        index=False,
    )

In [None]:
# Preview the first five rows of the last exported dataframe
df_inter2.head()

In [None]:
# List the columns of the last exported dataframe
df_inter2.columns