### Imports

In [1]:
import pandas as pd
import glob
import datetime
import os, shutil

### Path set-up

In [2]:
# Folder receiving the per-departement geo files. A DATA_DIR already defined
# in the namespace is kept (and echoed); otherwise the default is used.
if 'DATA_DIR' not in locals():
    DATA_DIR = './data/'
else:
    print(DATA_DIR)

# Other cells build paths by plain concatenation (DATA_DIR + 'geo_siret_...'),
# so make sure the value ends with a path separator. Without it,
# os.path.dirname() would also point at the parent folder instead of DATA_DIR.
DATA_DIR = os.path.join(DATA_DIR, '')

# Start from an empty folder. WARNING: any existing content is deleted.
if os.path.isdir(DATA_DIR):
    shutil.rmtree(DATA_DIR)
os.makedirs(DATA_DIR, exist_ok=True)

In [3]:
# Folder receiving the exported CSV files. An OUTPUT_DATA_FOLDER already
# defined in the namespace is kept (and echoed); otherwise the default is used.
if 'OUTPUT_DATA_FOLDER' not in locals():
    OUTPUT_DATA_FOLDER = './output/'
else:
    print(OUTPUT_DATA_FOLDER)

# Export paths are built by plain concatenation (OUTPUT_DATA_FOLDER + 'siret_...'),
# so make sure the value ends with a path separator. Without it,
# os.path.dirname() would also point at the parent folder instead of the
# output folder itself.
OUTPUT_DATA_FOLDER = os.path.join(OUTPUT_DATA_FOLDER, '')

# Start from an empty folder. WARNING: any existing content is deleted.
if os.path.isdir(OUTPUT_DATA_FOLDER):
    shutil.rmtree(OUTPUT_DATA_FOLDER)
os.makedirs(OUTPUT_DATA_FOLDER, exist_ok=True)

In [4]:
# Prefix of the head-office export files (presumably the name of the target
# Elasticsearch index - TODO confirm). A value already present in the
# namespace is kept and echoed; otherwise fall back to 'siren'.
if 'ELASTIC_INDEX' in locals():
    print(ELASTIC_INDEX)
else:
    ELASTIC_INDEX = 'siren'

### Import data

In [5]:
# Load the "Stock Unite Legale" file, restricted to the columns used below;
# every value is read as text.
df_unite_legale = pd.read_csv(
    'https://files.data.gouv.fr/insee-sirene/StockUniteLegale_utf8.zip',
    compression='zip',
    dtype=str,
    usecols=[
        'siren',
        'dateCreationUniteLegale',
        'sigleUniteLegale',
        'prenom1UniteLegale',
        'identifiantAssociationUniteLegale',
        'trancheEffectifsUniteLegale',
        'dateDernierTraitementUniteLegale',
        'categorieEntreprise',
        'etatAdministratifUniteLegale',
        'nomUniteLegale',
        'denominationUniteLegale',
        'categorieJuridiqueUniteLegale',
        'activitePrincipaleUniteLegale',
        'economieSocialeSolidaireUniteLegale',
    ],
)

In [6]:
# Switch the INSEE column names to the snake_case names used in the rest of
# the notebook; columns that are not listed keep their original name.
df_unite_legale = df_unite_legale.rename(
    {
        'dateCreationUniteLegale': 'date_creation_entreprise',
        'sigleUniteLegale': 'sigle',
        'prenom1UniteLegale': 'prenom',
        'trancheEffectifsUniteLegale': 'tranche_effectif_salarie_entreprise',
        'dateDernierTraitementUniteLegale': 'date_mise_a_jour',
        'categorieEntreprise': 'categorie_entreprise',
        'nomUniteLegale': 'nom',
        'denominationUniteLegale': 'nom_raison_sociale',
        'categorieJuridiqueUniteLegale': 'nature_juridique_entreprise',
        'activitePrincipaleUniteLegale': 'activite_principale_entreprise',
    },
    axis='columns',
)

### Data preprocessing

In [7]:
 def nom_complet(x):
     """Build the lower-cased display name of a legal unit.

     When ``nature_juridique_entreprise`` is ``'1000'`` the name is
     ``"<prenom> <nom>"``; otherwise it is ``nom_raison_sociale``. When a
     ``sigle`` is available it is appended in parentheses.

     Parameters
     ----------
     x : mapping-like (e.g. a DataFrame row)
         Must expose the keys ``nature_juridique_entreprise``, ``sigle`` and,
         depending on the branch, ``prenom``/``nom`` or ``nom_raison_sociale``.

     Returns
     -------
     str or None
         The lower-cased name, or ``None`` when a required name part is missing.
     """
     def _lowered(field):
         # Missing values can be NaN, None or pd.NA depending on how the frame
         # was built. pd.isna() recognises all of them, whereas the former
         # ``value == value`` test only recognised NaN.
         value = x[field]
         return None if pd.isna(value) else value.lower()

     if x['nature_juridique_entreprise'] == '1000':
         prenom, nom = _lowered('prenom'), _lowered('nom')
         # Both parts are required to name a person.
         base = None if prenom is None or nom is None else prenom + ' ' + nom
     else:
         base = _lowered('nom_raison_sociale')

     if base is None:
         return None

     sigle = _lowered('sigle')
     return base if sigle is None else base + ' (' + sigle + ')'

In [8]:
# Compute the display name of every legal unit, row by row
df_unite_legale['nom_complet'] = df_unite_legale.apply(nom_complet, axis=1)

In [None]:
# Build the list of departement codes used to name the geo files to download.
# Fixes two gaps of the previous join/split construction: departement '95'
# and Paris arrondissement '75110' were never generated.
# NOTE(review): overseas departements (97x) are not listed - confirm whether
# they should be.
all_deps = [
    # 01-19, zero-padded
    *[str(code).zfill(2) for code in range(1, 20)],
    # No '20' in the list: '2A' and '2B' are used instead
    '2A', '2B',
    # 21-95, without '75' (Paris), which is listed per arrondissement below
    *[str(code) for code in range(21, 96) if code != 75],
    # Paris arrondissements 75101-75120
    *[str(75100 + arrondissement) for arrondissement in range(1, 21)],
    # Empty code kept from the original list (yields 'geo_siret_.csv.gz');
    # presumably establishments without a departement - TODO confirm
    '',
]

In [9]:
# Restrict the run to a sample of five departements
all_deps = [
    '11',
    '23',
    '37',
    '57',
    '69',
]

In [10]:
# Download the geocoded establishment file of every departement and store a
# trimmed, renamed copy as CSV in DATA_DIR.

# Columns read from the remote files (everything else is ignored).
geo_columns = [
    'siren', 'siret',
    'dateCreationEtablissement', 'trancheEffectifsEtablissement',
    'activitePrincipaleRegistreMetiersEtablissement',
    'etablissementSiege',
    'numeroVoieEtablissement',
    'libelleVoieEtablissement',
    'codePostalEtablissement', 'libelleCommuneEtablissement',
    'codeCommuneEtablissement',
    'dateDebut', 'etatAdministratifEtablissement', 'enseigne1Etablissement',
    'activitePrincipaleEtablissement',
    'geo_adresse', 'longitude', 'latitude', 'indiceRepetitionEtablissement',
]

# INSEE name -> snake_case name. rename() silently ignores keys that are not
# present in the frame.
geo_renames = {
    'dateCreationEtablissement': 'date_creation',
    'trancheEffectifsEtablissement': 'tranche_effectif_salarie',
    'activitePrincipaleRegistreMetiersEtablissement': 'activite_principale_registre_metier',
    'etablissementSiege': 'is_siege',
    'numeroVoieEtablissement': 'numero_voie',
    # Not loaded at the moment (absent from geo_columns); kept so the column
    # is renamed if it gets added.
    'typeVoieEtablissement': 'type_voie',
    'libelleVoieEtablissement': 'libelle_voie',
    'codePostalEtablissement': 'code_postal',
    'libelleCommuneEtablissement': 'libelle_commune',
    'codeCommuneEtablissement': 'commune',
    # Not loaded at the moment (absent from geo_columns), same as type_voie.
    'codeCedexEtablissement': 'cedex',
    # The key used to be 'dateDebut_x', which never matched the loaded
    # 'dateDebut' column, so the rename was silently skipped.
    'dateDebut': 'date_debut_activite',
    'etatAdministratifEtablissement': 'etat_administratif_etablissement',
    'enseigne1Etablissement': 'enseigne',
    'activitePrincipaleEtablissement': 'activite_principale',
    'indiceRepetitionEtablissement': 'indice_repetition',
}

for dep in all_deps:
    url = 'https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_'+dep+'.csv.gz'
    print(url)
    df_dep = pd.read_csv(
        url,
        compression="gzip",
        dtype=str,
        usecols=geo_columns,
    )
    df_dep = df_dep.rename(columns=geo_renames)
    df_dep.to_csv(DATA_DIR+'geo_siret_'+dep+'.csv', index=False)

https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_11.csv.gz
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_23.csv.gz
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_37.csv.gz
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_57.csv.gz
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_69.csv.gz


In [11]:
# List the per-departement geo files found in DATA_DIR
geo_pattern = DATA_DIR + "geo_siret*.csv"
geo_files = glob.glob(geo_pattern)

In [12]:
# Sort the paths in place so the files are processed in a stable, alphabetical order
geo_files.sort()

In [56]:
# For every departement file compute, per 'siren': the number of
# establishments ('siret' count), the set of shop signs ('enseigne') and the
# set of geocoded addresses ('geo_adresse').
#
# Each file is aggregated straight to one row per siren and the pieces are
# concatenated once at the end. This avoids growing df_out inside the loop
# and carrying the same Python sets on every establishment row.
per_file_stats = []
for geo_file in geo_files:
    print(geo_file)
    df_geo = pd.read_csv(geo_file, dtype=str)
    siren_groups = df_geo.groupby('siren')
    df_inter = pd.DataFrame({
        'nombre_etablissements': siren_groups['siret'].count(),
        # Sets keep NaN entries, exactly like the raw columns
        'liste_enseigne_dep': siren_groups['enseigne'].apply(set),
        'liste_adresse_dep': siren_groups['geo_adresse'].apply(set),
    }).reset_index()
    # Remember which file the row comes from (second column, after 'siren')
    df_inter.insert(1, 'file', geo_file)
    per_file_stats.append(df_inter)

if per_file_stats:
    df_out = pd.concat(per_file_stats)
else:
    # No geo file found: keep the expected columns so later cells can still
    # select them.
    df_out = pd.DataFrame(columns=[
        'siren', 'file', 'nombre_etablissements',
        'liste_enseigne_dep', 'liste_adresse_dep',
    ])

./data/geo_siret_11.csv
./data/geo_siret_23.csv
./data/geo_siret_37.csv
./data/geo_siret_57.csv
./data/geo_siret_69.csv


In [57]:
# Keep one row per (siren, file), then attach to every row the list of the
# per-file sets gathered for its siren (shop signs first, then addresses).
df_out = df_out.drop_duplicates(subset=['siren', 'file'], keep='first')
for dep_column, merged_column in [
    ('liste_enseigne_dep', 'liste_enseigne'),
    ('liste_adresse_dep', 'liste_adresse'),
]:
    df_liste = (
        df_out
        .groupby(['siren'])[dep_column]
        .apply(list)
        .reset_index(name=merged_column)
    )
    df_out = df_out.merge(df_liste, on='siren')

In [58]:
# Total number of establishments per siren, summed over the departement files
df_out2 = df_out.groupby('siren', as_index=False)['nombre_etablissements'].sum()

In [59]:
# Bring the per-siren lists next to the totals; df_out may hold several rows
# per siren (one per file), so only the first match is kept.
df_out2 = (
    df_out2
    .merge(
        df_out[['liste_enseigne', 'liste_adresse', 'siren']],
        how='left',
        on='siren',
    )
    .drop_duplicates(subset=['siren'], keep='first')
)

In [None]:
# Flatten each list of per-file sets into a single set per siren
for list_column in ('liste_enseigne', 'liste_adresse'):
    df_out2[list_column] = df_out2[list_column].map(
        lambda per_file_sets: set().union(*per_file_sets)
    )

In [79]:
# Display the per-siren aggregates (establishment count, shop signs, addresses) for a visual check
df_out2

Unnamed: 0,siren,nombre_etablissements,liste_enseigne,liste_adresse
0,005720651,1,{nan},{10 Rue Général Plessier 69002 Lyon}
1,006311617,1,{nan},{Place Mirabeau 37500 Chinon}
2,006350037,1,{nan},{66 Rue Béchevelin 69007 Lyon}
3,006580195,1,{SIDES},{7 Chemin de Genas 69800 Saint-Priest}
4,006970859,1,{nan},{54 Cours Vitton 69006 Lyon}
...,...,...,...,...
1449328,998912307,1,{nan},{30 Rue de Lorquin 57400 Imling}
1449329,998922108,4,"{nan, CHAUSSURES LAITI, SAN MARINA}","{Voie Romaine 57280 Semécourt, 21 Rue de Paris..."
1449330,999990005,6,"{nan, STEF LOGISTIQUE, STEF}","{nan, 14 Rue Marcel Mérieux 69960 Corbas, 67 A..."
1449332,999990468,5,{nan},"{62 Rue de Bonnel 69003 Lyon, 11 Rue de la Rép..."


In [80]:
# Attach the establishment aggregates to the legal units; units without any
# establishment in the processed files get missing values (left join).
df_unite_legale = df_unite_legale.merge(df_out2, how='left', on='siren')

In [None]:
# Compute the number of open establishments per 'siren'
# (rows whose 'etat_administratif_etablissement' is 'A').
#
# Each file is aggregated to one row per siren and the pieces are
# concatenated once. This avoids growing df_out inside the loop and assigning
# columns on a filtered slice (SettingWithCopyWarning).
open_counts = []
for geo_file in geo_files:
    print(geo_file)
    df_geo = pd.read_csv(geo_file, dtype=str)
    df_geo = df_geo[df_geo['etat_administratif_etablissement'] == 'A']
    df_inter = (
        df_geo
        .groupby('siren')['siret']
        .count()
        .reset_index(name='nombre_etablissements_ouvert')
    )
    # Remember which file the row comes from (second column, after 'siren')
    df_inter.insert(1, 'file', geo_file)
    open_counts.append(df_inter)

if open_counts:
    df_out = pd.concat(open_counts)
else:
    # No geo file found: keep the expected columns so the aggregation below
    # still works.
    df_out = pd.DataFrame(columns=['siren', 'file', 'nombre_etablissements_ouvert'])

# Sum the per-file counts, then attach the total to the legal units.
df_out2 = df_out.groupby('siren', as_index=False)['nombre_etablissements_ouvert'].sum()
df_unite_legale = pd.merge(df_unite_legale, df_out2, on='siren', how='left')

In [None]:
# Join every establishment with the data of its legal unit and export two CSV
# files per departement: all establishments ('siret_<dep>.csv') and head
# offices only ('<ELASTIC_INDEX>_<dep>.csv').
def _as_set(value):
    """Return ``value`` if it is a set, else an empty set.

    After the left join below, establishments whose siren is absent from
    df_unite_legale carry NaN instead of a set.
    """
    return value if isinstance(value, set) else set()


for geo_file in geo_files:
    print(geo_file)
    df_geo = pd.read_csv(geo_file, dtype=str)
    df_inter = pd.merge(df_geo, df_unite_legale, on='siren', how='left')

    # .copy(): columns are added below, and doing so on a slice of df_inter
    # triggers SettingWithCopyWarning.
    df_inter2 = df_inter[df_inter['is_siege'] == 'true'].copy()
    df_inter2['concat_nom_adr_siren'] = (
        df_inter2['nom_complet'] + ' ' + df_inter2['geo_adresse'] + ' ' + df_inter2['siren']
    )
    # Union of the shop-sign and address sets of the legal unit; missing sets
    # are treated as empty instead of raising AttributeError.
    df_inter2['concat_enseigne_adresse'] = pd.Series(
        [
            _as_set(enseignes) | _as_set(adresses)
            for enseignes, adresses in zip(df_inter2['liste_enseigne'], df_inter2['liste_adresse'])
        ],
        index=df_inter2.index,
        dtype=object,
    )

    # '<dep>.csv' part of the input file name, reused for both exports
    dep_suffix = geo_file.replace(DATA_DIR+'geo_siret_', '')
    df_inter.to_csv(OUTPUT_DATA_FOLDER+'siret_'+dep_suffix, index=False)
    df_inter2.to_csv(OUTPUT_DATA_FOLDER+ELASTIC_INDEX+'_'+dep_suffix, index=False)