### Imports

In [91]:
import pandas as pd
import glob
import datetime
import os, shutil
import numpy as np
import swifter
import math
import json

### Path set-up

In [92]:
# Keep a DATA_DIR that is already defined in the session (and echo it);
# otherwise fall back to the local default.
if "DATA_DIR" not in locals():
    DATA_DIR = "./data/"
else:
    print(DATA_DIR)

# Other cells build file names with `DATA_DIR + "<file>"`, so guarantee a
# trailing separator even when a pre-defined value omits it.
DATA_DIR = os.path.join(DATA_DIR, "")

# Start from an empty directory: wipe whatever a previous run left, then
# recreate the directory itself (not its parent).
if os.path.isdir(DATA_DIR):
    shutil.rmtree(DATA_DIR)
os.makedirs(DATA_DIR, exist_ok=True)

./data/


In [93]:
# Keep an OUTPUT_DATA_FOLDER that is already defined in the session (and echo
# it); otherwise fall back to the local default.
if "OUTPUT_DATA_FOLDER" not in locals():
    OUTPUT_DATA_FOLDER = "./output/"
else:
    print(OUTPUT_DATA_FOLDER)

# Output file names are built with `OUTPUT_DATA_FOLDER + "<file>"`, so
# guarantee a trailing separator even when a pre-defined value omits it.
OUTPUT_DATA_FOLDER = os.path.join(OUTPUT_DATA_FOLDER, "")

# Start from an empty directory: wipe whatever a previous run left, then
# recreate the directory itself (not its parent).
if os.path.isdir(OUTPUT_DATA_FOLDER):
    shutil.rmtree(OUTPUT_DATA_FOLDER)
os.makedirs(OUTPUT_DATA_FOLDER, exist_ok=True)

./output/


In [94]:
# Echo a LABELS_FOLDER that is already defined in the session;
# otherwise use the default location.
if "LABELS_FOLDER" in locals():
    print(LABELS_FOLDER)
else:
    LABELS_FOLDER = "./labels/"

./labels/


In [95]:
# Echo an ELASTIC_INDEX that is already defined in the session;
# otherwise use the default index name.
if "ELASTIC_INDEX" in locals():
    print(ELASTIC_INDEX)
else:
    ELASTIC_INDEX = "siren"

siren


# Établissement

In [96]:
# File suffixes of the per-departement geo-sirene exports, in download order.
# Paris is not published as a single "75" file: it is listed by arrondissement
# code instead (75101 .. 75120). The previous join/split construction dropped
# its first element and therefore skipped 75110.
all_deps = [
    *(f"{n:02d}" for n in range(1, 10)),  # "01" .. "09"
    *(str(n) for n in range(10, 20)),  # "10" .. "19"
    "2A",
    "2B",
    *(str(n) for n in range(21, 96) if n != 75),  # "21" .. "95" without Paris
    *(f"751{n:02d}" for n in range(1, 21)),  # "75101" .. "75120"
    "971",
    "972",
    "973",
    "974",
    "976",
    # Empty suffix -> geo_siret_.csv.gz.
    # NOTE(review): presumably establishments without a departement - confirm.
    "",
]

In [97]:
#all_deps = ["23", "69"]

In [98]:
import time
# Wall-clock reference point; stats() prints the seconds elapsed since this value.
start_time = time.time()

In [99]:
from psutil import virtual_memory

def mem():
    """Print the value at index 3 of `virtual_memory()`, divided by 1024**3
    and rounded to one decimal, followed by the "Go" suffix.

    NOTE(review): index 3 is presumably psutil's `used` field in bytes - confirm.
    """
    raw_value = virtual_memory()[3]
    scaled = raw_value / (1024 * 1024 * 1024) * 10
    print(f'used memory : {round(scaled) / 10}Go')

In [100]:
def stats():
    """Print the seconds elapsed since `start_time`, then the memory report from `mem()`."""
    elapsed = time.time() - start_time
    print("--- %s seconds ---" % elapsed)
    mem()

In [None]:
%%time
# Download geo data by departement

# We could reduce this to only the download time by scrapping pandas entirely and only saving the CSV on disk.
# I did not do it as it requires rewriting the column names.
# It should save ~4-5 min.

# Columns read from the geo-sirene export and kept under their original name.
GEO_SIRET_KEPT_COLUMNS = [
    "siren",
    "siret",
    "geo_adresse",
    "geo_id",
    "longitude",
    "latitude",
]

# Source column -> column name written to the local CSV.
GEO_SIRET_RENAMED_COLUMNS = {
    "dateCreationEtablissement": "date_creation",
    "trancheEffectifsEtablissement": "tranche_effectif_salarie",
    "activitePrincipaleRegistreMetiersEtablissement": "activite_principale_registre_metier",
    "etablissementSiege": "is_siege",
    "numeroVoieEtablissement": "numero_voie",
    "typeVoieEtablissement": "type_voie",
    "libelleVoieEtablissement": "libelle_voie",
    "codePostalEtablissement": "code_postal",
    "libelleCedexEtablissement": "libelle_cedex",
    "libelleCommuneEtablissement": "libelle_commune",
    "codeCommuneEtablissement": "commune",
    "complementAdresseEtablissement": "complement_adresse",
    "complementAdresse2Etablissement": "complement_adresse_2",
    "numeroVoie2Etablissement": "numero_voie_2",
    "indiceRepetition2Etablissement": "indice_repetition_2",
    "typeVoie2Etablissement": "type_voie_2",
    "libelleVoie2Etablissement": "libelle_voie_2",
    "codeCommune2Etablissement": "commune_2",
    "libelleCommune2Etablissement": "libelle_commune_2",
    "codeCedex2Etablissement": "cedex_2",
    "libelleCedex2Etablissement": "libelle_cedex_2",
    "codeCedexEtablissement": "cedex",
    "dateDebut": "date_debut_activite",
    "distributionSpecialeEtablissement": "distribution_speciale",
    "distributionSpeciale2Etablissement": "distribution_speciale_2",
    "etatAdministratifEtablissement": "etat_administratif_etablissement",
    "enseigne1Etablissement": "enseigne_1",
    "enseigne2Etablissement": "enseigne_2",
    "enseigne3Etablissement": "enseigne_3",
    "activitePrincipaleEtablissement": "activite_principale",
    "indiceRepetitionEtablissement": "indice_repetition",
    "denominationUsuelleEtablissement": "nom_commercial",
    "libelleCommuneEtrangerEtablissement": "libelle_commune_etranger",
    "codePaysEtrangerEtablissement": "code_pays_etranger",
    "libellePaysEtrangerEtablissement": "libelle_pays_etranger",
    "libelleCommuneEtranger2Etablissement": "libelle_commune_etranger_2",
    "codePaysEtranger2Etablissement": "code_pays_etranger_2",
    "libellePaysEtranger2Etablissement": "libelle_pays_etranger_2",
}

# Every column to load: the pass-through ones plus every renamed one. Deriving
# the list from the mapping keeps the two in sync (the hand-written list named
# "enseigne1Etablissement" twice). read_csv ignores the order of `usecols`, so
# the column order of the saved CSV is still the order of the source file.
GEO_SIRET_USECOLS = GEO_SIRET_KEPT_COLUMNS + list(GEO_SIRET_RENAMED_COLUMNS)

# Download each departement export, keep the selected columns as strings,
# rename them, and store the result as an uncompressed CSV under DATA_DIR.
for dep in all_deps:
    stats()
    url = "https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_" + dep + ".csv.gz"
    print(url)
    df_dep = pd.read_csv(
        url,
        compression="gzip",
        dtype=str,
        usecols=GEO_SIRET_USECOLS,
    )
    df_dep = df_dep.rename(columns=GEO_SIRET_RENAMED_COLUMNS)
    df_dep.to_csv(DATA_DIR + "geo_siret_" + dep + ".csv", index=False)

--- 0.3817572593688965 seconds ---
used memory : 4.9Go
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_01.csv.gz
--- 8.914906024932861 seconds ---
used memory : 4.8Go
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_02.csv.gz
--- 13.169343948364258 seconds ---
used memory : 4.8Go
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_03.csv.gz
--- 16.420730113983154 seconds ---
used memory : 4.8Go
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_04.csv.gz
--- 19.047084093093872 seconds ---
used memory : 4.9Go
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_05.csv.gz
--- 21.360645055770874 seconds ---
used memory : 4.9Go
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_06.csv.gz
--- 37.93453621864319 seconds ---
used memory : 4.7Go
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_07.csv.gz
--- 41.40235209465027 seconds ---
used memory : 4.7Go
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_08.csv.gz
--- 43.67071294784546 secon

In [None]:
# Get geo data file paths. glob returns them in a filesystem-dependent order,
# so sort them to make the processing order reproducible between runs.
geo_files = sorted(glob.glob(DATA_DIR + "geo_siret*.csv"))

In [None]:
def adresse_complete(cols, row, adresse_2=False):
    """Build a single-line postal address from one establishment row.

    Parameters
    ----------
    cols : dict
        Maps a column name to its position in `row`.
    row : sequence
        One CSV record; empty values are treated as missing.
    adresse_2 : bool, default False
        When True, read the secondary address, i.e. the same column names with
        a "_2" suffix (the flag previously had no effect: the suffixed name was
        computed but never used).

    Returns
    -------
    str
        The non-empty parts joined by single spaces: street-level fields, then
        cedex code and label if a cedex is present (otherwise commune code and
        label), then the foreign commune and country labels. Empty string when
        every field is missing.
    """
    suffix = "_2" if adresse_2 else ""

    def get(x, default=None):
        val = row[cols[x + suffix]]
        if not val:
            return default
        return val

    def present(*names):
        # Keep only the fields that have a value, so a missing label is skipped
        # instead of being rendered as the text "None".
        return [str(get(name)) for name in names if get(name)]

    parts = present(
        "complement_adresse",
        "numero_voie",
        "indice_repetition",
        "type_voie",
        "libelle_voie",
        "distribution_speciale",
    )

    # A cedex, when present, takes precedence over the commune.
    if get("cedex") is not None:
        parts += present("cedex", "libelle_cedex")
    elif get("commune") is not None:
        parts += present("commune", "libelle_commune")

    parts += present("libelle_commune_etranger", "libelle_pays_etranger")
    return " ".join(parts).strip()

In [None]:
def get_key(k, dico, default=0):
    """Return `dico[k]` when `k` is a key of `dico`, otherwise `default`."""
    return dico.get(k, default)

In [None]:
def parse_etab(cols, row, all_unite_legale, index):
    """Fold one establishment row into its legal unit's entry.

    Parameters
    ----------
    cols : dict
        Maps a column name to its position in `row`.
    row : sequence of str
        One record of a geo_siret CSV.
    all_unite_legale : dict
        Accumulator keyed by siren; updated in place.
    index : int
        Row number in the file (currently unused, kept for the caller).

    Effects on ``all_unite_legale[siren]``
    --------------------------------------
    - ``etablissements[siret]`` receives ``{"enseigne", "adresse", "siret"}``.
    - ``nombre_etablissements`` is incremented.
    - ``nombre_etablissements_ouverts`` is incremented when the establishment's
      administrative state is "A".
    - ``adresse_complete`` / ``adresse_complete_2`` are set from the head-office
      row only.
    """
    def get(x):
        return row[cols[x]]

    siren = get("siren")
    siret = get("siret")
    unite_legale = all_unite_legale.get(siren, {})

    etablissement = {
        # NOTE(review): the name parts are concatenated without a separator,
        # as before - confirm that this is intended.
        "enseigne": "".join([
            get("enseigne_1"),
            get("enseigne_2"),
            get("enseigne_3"),
            get("nom_commercial"),
        ]),
        "adresse": get("geo_adresse"),
        "siret": siret,
    }

    # Add to the establishments already collected for this siren; the mapping
    # used to be reset on every row, so only the last establishment survived.
    unite_legale.setdefault("etablissements", {})[siret] = etablissement
    unite_legale["nombre_etablissements"] = unite_legale.get("nombre_etablissements", 0) + 1

    # The CSV is read as text, so the flag is the string "true" or "false";
    # a plain truthiness test would accept "false" as well.
    if get("is_siege").strip().lower() == "true":
        unite_legale["adresse_complete"] = adresse_complete(cols, row)
        unite_legale["adresse_complete_2"] = adresse_complete(cols, row, adresse_2=True)

    if get("etat_administratif_etablissement") == "A":
        unite_legale["nombre_etablissements_ouverts"] = (
            unite_legale.get("nombre_etablissements_ouverts", 0) + 1
        )

    all_unite_legale[siren] = unite_legale
    
#    if(index % 10000 == 0):
#        print(unite_legale)

In [None]:
%%time
from csv import reader

# Accumulator for everything known about each legal unit, keyed by siren.
all_unite_legale = {}

for geo_file in geo_files:
    cols = {}
    # newline="" lets the csv module handle line endings itself (needed for
    # quoted fields that contain line breaks), and the encoding is pinned to
    # UTF-8, the default that pandas' to_csv used when writing these files,
    # instead of depending on the platform locale.
    with open(geo_file, "r", newline="", encoding="utf-8") as read_obj:
        csv_reader = reader(read_obj)
        for index, row in enumerate(csv_reader):
            if index == 0:
                # Header row: remember the position of every column name.
                cols = {name: position for position, name in enumerate(row)}
            else:
                parse_etab(cols, row, all_unite_legale, index)
            # Memory report on the header row and then every million rows.
            if index % 1000000 == 0:
                mem()

In [None]:
# Flatten each legal unit's establishments into two parallel lists
# (names and addresses), in the establishments' stored order.
for unite_legale in all_unite_legale.values():
    etablissements = list(unite_legale["etablissements"].values())
    unite_legale["liste_enseigne"] = [etab["enseigne"] for etab in etablissements]
    unite_legale["liste_adresse"] = [etab["adresse"] for etab in etablissements]

In [None]:
# Preview the first five legal-unit entries after the establishment pass
list(all_unite_legale.values())[:5]

# Unité Légale

In [None]:
from os.path import exists

# Local CSV that the download cell writes the renamed unité légale export to
unite_file = DATA_DIR + "unite_legales.csv"

# True when that CSV is already on disk; the download cell tests this flag
unite_legale_file_exists = os.path.exists(unite_file)

In [None]:
%%time

# Import Stock Unite Legale data

# Same as for établissements: we could reduce this to only the download time by scrapping pandas entirely and only saving the CSV on disk.
# I did not do it as it requires rewriting the column names.
# It should save ~1-2 min.

# Load the unité légale stock as strings. Download and cache it on first use;
# otherwise read the cached CSV so that `df_unite_legale` is defined either way
# (it used to be undefined whenever the cache file was already present).
if not unite_legale_file_exists:
    df_unite_legale = pd.read_csv(
        "https://files.data.gouv.fr/insee-sirene/StockUniteLegale_utf8.zip",
        compression="zip",
        dtype=str,
        usecols=[
            "siren",
            "dateCreationUniteLegale",
            "sigleUniteLegale",
            "prenom1UniteLegale",
            "identifiantAssociationUniteLegale",
            "trancheEffectifsUniteLegale",
            "dateDernierTraitementUniteLegale",
            "categorieEntreprise",
            "etatAdministratifUniteLegale",
            "nomUniteLegale",
            "nomUsageUniteLegale",
            "denominationUniteLegale",
            "categorieJuridiqueUniteLegale",
            "activitePrincipaleUniteLegale",
            "economieSocialeSolidaireUniteLegale",
        ],
    )
    # Rename columns
    df_unite_legale = df_unite_legale.rename(
        columns={
            "dateCreationUniteLegale": "date_creation_unite_legale",
            "sigleUniteLegale": "sigle",
            "prenom1UniteLegale": "prenom",
            "trancheEffectifsUniteLegale": "tranche_effectif_salarie_unite_legale",
            "dateDernierTraitementUniteLegale": "date_mise_a_jour_unite_legale",
            "categorieEntreprise": "categorie_entreprise",
            "etatAdministratifUniteLegale":"etat_administratif_unite_legale",
            "nomUniteLegale": "nom",
            "nomUsageUniteLegale": "nom_usage",
            "denominationUniteLegale": "nom_raison_sociale",
            "categorieJuridiqueUniteLegale": "nature_juridique_unite_legale",
            "activitePrincipaleUniteLegale": "activite_principale_unite_legale",
            "economieSocialeSolidaireUniteLegale":"economie_sociale_solidaire_unite_legale",
            "identifiantAssociationUniteLegale":"identifiant_association_unite_legale",
        }
    )
    df_unite_legale.to_csv(unite_file, index=False)
else:
    # The cached file already carries the renamed columns.
    df_unite_legale = pd.read_csv(unite_file, dtype=str)

In [None]:
# Preview the first rows of the renamed unité légale frame
df_unite_legale.head()

In [None]:
# Elapsed time and memory after both datasets have been fetched
stats()


### Data preprocessing

In [None]:
# Inclusive (section letter, first code, last code) ranges; codes are expanded
# below to zero-padded two-character keys.
_NAF_SECTION_RANGES = [
    ("A", 1, 3), ("B", 5, 9), ("C", 10, 33), ("D", 35, 35), ("E", 36, 39),
    ("F", 41, 43), ("G", 45, 47), ("H", 49, 53), ("I", 55, 56), ("J", 58, 63),
    ("K", 64, 66), ("L", 68, 68), ("M", 69, 75), ("N", 77, 82), ("O", 84, 84),
    ("P", 85, 85), ("Q", 86, 88), ("R", 90, 93), ("S", 94, 96), ("T", 97, 98),
    ("U", 99, 99),
]

# Two-character code (the first two characters of an activity code) -> section letter.
sections_NAF = {
    f"{code:02d}": section
    for section, first, last in _NAF_SECTION_RANGES
    for code in range(first, last + 1)
}

In [None]:
def nom_complet(cols, row):
    """Build the lower-cased display name of a legal unit.

    Parameters
    ----------
    cols : dict
        Maps a column name to its position in `row`.
    row : sequence of str
        One record of the unité légale CSV; empty values count as missing.

    Returns
    -------
    str or None
        - Legal category "1000": ``"<prenom> <nom_usage> (<nom>, <sigle>)"``.
          None when both first name and name are missing.
        - Any other category: ``"<nom_raison_sociale> (<sigle>)"``.
          None when both are missing.
        Missing parts are left out entirely; they used to appear as the text
        "none" or as extra spaces.
    """
    def get(x, default=None):
        val = row[cols[x]]
        if not val:
            return default
        return val

    def join(separator, *parts):
        # Join only the parts that have a value.
        return separator.join(part for part in parts if part)

    sigle = get("sigle")

    if get("nature_juridique_unite_legale") == "1000":
        prenom = get("prenom")
        nom = get("nom")
        if prenom is None and nom is None:
            return None
        in_parentheses = join(", ", nom, sigle)
        return join(
            " ",
            prenom,
            get("nom_usage"),
            f"({in_parentheses})" if in_parentheses else None,
        ).lower()

    nom_raison_sociale = get("nom_raison_sociale")
    if nom_raison_sociale is None and sigle is None:
        return None
    return join(" ", nom_raison_sociale, f"({sigle})" if sigle else None).lower()

In [None]:
def parse_unite(cols, row, all_unite_legale, index):
    """Merge one unité légale row into the accumulator entry for its siren.

    Sets ``nom_complet`` (via `nom_complet`) and
    ``section_activite_principale`` (the `sections_NAF` entry for the first
    two characters of the main activity code, or None when there is none).
    `all_unite_legale` is updated in place. When `index` is a multiple of
    5,000,000, `stats()` is called and `index` is printed.
    """
    def field(name, fallback=None):
        # Empty values count as missing.
        return row[cols[name]] or fallback

    siren = field('siren')
    unite_legale = all_unite_legale.get(siren, {})

    unite_legale['nom_complet'] = nom_complet(cols, row)
    code_naf = field('activite_principale_unite_legale', '')[:2]
    unite_legale["section_activite_principale"] = sections_NAF.get(code_naf)

    all_unite_legale[siren] = unite_legale

    if index % 5000000 == 0:
        stats()
        print(index)

In [None]:
%%time

# newline="" lets the csv module handle line endings itself (needed for quoted
# fields that contain line breaks), and the encoding is pinned to UTF-8, the
# default that pandas' to_csv used when writing this file, instead of depending
# on the platform locale.
with open(unite_file, "r", newline="", encoding="utf-8") as read_obj:
    cols = {}
    csv_reader = reader(read_obj)
    for index, row in enumerate(csv_reader):
        if index == 0:
            # Header row: remember the position of every column name.
            cols = {name: position for position, name in enumerate(row)}
        else:
            parse_unite(cols, row, all_unite_legale, index)

In [None]:
# Preview the first five legal-unit entries after the unité légale pass
list(all_unite_legale.values())[:5]

In [None]:
# Number of distinct keys collected in the accumulator
len(all_unite_legale)

%%time
# Merge geo files with above dataframe and add is_entrepreneur_individuel
#
# NOTE(review): this cell looks like legacy code and is unlikely to run as-is
# against the rest of the visible notebook - confirm whether it should be
# removed or ported:
#   * it has no `In [ ]` marker in this export;
#   * it calls `adresse_complete(x)` with one argument, while the function
#     defined above takes `(cols, row, adresse_2=False)`;
#   * it calls `adresse_complete_2`, which is not defined in the visible cells;
#   * it reads `nom_complet`, `liste_enseigne`, `liste_adresse` and
#     `nombre_etablissements*` columns that neither `df_geo` nor
#     `df_unite_legale`, as built above, appear to contain.
for geo_file in geo_files:
    print(geo_file)
    df_geo = pd.read_csv(geo_file, dtype=str)
    # Left join: every establishment row is kept even without a matching siren
    df_inter = pd.merge(df_geo, df_unite_legale, on="siren", how="left")
    # Keep head-office rows only; the flag is the string "true" (dtype=str)
    df_inter2 = df_inter[df_inter["is_siege"] == "true"]
    df_inter2 = df_inter2.replace({np.nan: None})
    df_inter2["concat_nom_adr_siren"] = (
        df_inter2["nom_complet"]
        + " "
        + df_inter2["geo_adresse"]
        + " "
        + df_inter2["siren"]
    )
   
    # df_inter2['concat_enseigne_adresse'] = df_inter2.apply(lambda x: set.union(x.liste_enseigne, x.liste_adresse), axis=1)
    df_inter2["concat_enseigne_adresse"] = (
        df_inter2["liste_enseigne"] + df_inter2["liste_adresse"]
    )
    
    df_inter2["is_entrepreneur_individuel"] = df_inter2.apply(lambda x: True if x.nature_juridique_unite_legale in ['1', '10', '1000'] else False, axis=1) # entrepreneur individuel
    # "latitude,longitude" string, or None when either coordinate is missing
    df_inter2["coordonnees"] = df_inter2.apply(lambda x: None if ((x.latitude is None) or (x.longitude is None)) else (x.latitude + "," + x.longitude) , axis=1)
    df_inter2['nombre_etablissements_ouverts'] = df_inter2['nombre_etablissements_ouverts'].replace({np.nan: 0})
    df_inter2['nombre_etablissements'] = df_inter2['nombre_etablissements'].replace({np.nan: 1})
    df_inter2['nombre_etablissements'] = df_inter2['nombre_etablissements'].astype(int)
    df_inter2['nombre_etablissements_ouverts'] = df_inter2['nombre_etablissements_ouverts'].astype(int)
    # First three characters of the commune code when it starts with "97", otherwise the first two
    df_inter2['departement'] = df_inter2.apply(lambda x: str(x.commune)[:3] if str(x.commune)[:2]=="97" else (None if x.commune is None else str(x.commune)[:2]), axis=1)
    df_inter2.drop(columns='is_siege', axis=1, inplace=True)
    df_inter2['adresse_etablissement'] = df_inter2.apply(lambda x: adresse_complete(x), axis=1)
    df_inter2['adresse_etablissement_2'] = df_inter2.apply(lambda x: adresse_complete_2(x), axis=1)
    df_inter2['is_entrepreneur_individuel'] = df_inter2['is_entrepreneur_individuel'].map({True: 'true', False: 'false'}) # Elastic only takes 'true' and 'false' as bool
    # Suffix the establishment-level columns with "_siege"
    df_inter2 = df_inter2.rename(
        columns={
            "activite_principale": "activite_principale_siege",
            "date_creation": "date_creation_siege",
            "date_debut_activite": "date_debut_activite_siege",
            "etat_administratif_etablissement": "etat_administratif_siege",
            "siret": "siret_siege",
            "tranche_effectif_salarie": "tranche_effectif_salarie_siege",
        }
    )
    # Drop the raw secondary-address columns
    df_inter2.drop(
        columns=[
            "complement_adresse_2", "numero_voie_2", "indice_repetition_2", "type_voie_2",
            "libelle_voie_2", "distribution_speciale_2", "cedex_2", "libelle_commune_2",
            "commune_2","libelle_cedex_2","libelle_commune_etranger_2", "code_pays_etranger_2", "libelle_pays_etranger_2"], axis=1, inplace=True)
    '''
    df_inter.to_csv(
        OUTPUT_DATA_FOLDER + "siret_" + geo_file.replace(DATA_DIR + "geo_siret_", ""),
        index=False,
    )
    '''
    # One output CSV per input file: <OUTPUT_DATA_FOLDER><ELASTIC_INDEX>_<suffix>.csv
    df_inter2.to_csv(
        OUTPUT_DATA_FOLDER
        + ELASTIC_INDEX
        + "_"
        + geo_file.replace(DATA_DIR + "geo_siret_", ""),
        index=False,
    )

In [None]:
%%time
# df_unite_legale = pd.DataFrame(list(all_unite_legale.values()))
# Memory report once the accumulator has been built
mem()

In [None]:
# df_unite_legale.head()

In [None]:
# Number of distinct keys collected in the accumulator
len(all_unite_legale)

In [None]:
# Final elapsed-time and memory report
stats()