### Imports

In [1]:
import os, shutil
import time
import sqlite3
import csv
from psutil import virtual_memory
import pandas as pd

In [2]:
def mem():
    print(f'used memory : {round(virtual_memory()[3]/(1024*1024*1024)*10)/10}Go')

In [3]:
def stats(): 
    print("--- %s seconds ---" % (time.time() - start_time))
    mem()

### Path set-up

In [4]:
if "DATA_DIR" not in locals():
    DATA_DIR = "./data/"
else:
    print(DATA_DIR)

if os.path.exists(DATA_DIR) and os.path.isdir(DATA_DIR):
    shutil.rmtree(DATA_DIR)
os.makedirs(os.path.dirname(DATA_DIR), exist_ok=True)

In [5]:
if "OUTPUT_DATA_FOLDER" not in locals():
    OUTPUT_DATA_FOLDER = "./output/"
else:
    print(OUTPUT_DATA_FOLDER)

if os.path.exists(OUTPUT_DATA_FOLDER) and os.path.isdir(OUTPUT_DATA_FOLDER):
    shutil.rmtree(OUTPUT_DATA_FOLDER)
os.makedirs(os.path.dirname(OUTPUT_DATA_FOLDER), exist_ok=True)

## SQLITE


In [65]:
!rm sirene.db

In [66]:
connection = sqlite3.connect('sirene.db')

In [67]:
cursor = connection.cursor()

## Unité Légale

In [68]:
cursor.execute('''CREATE TABLE IF NOT EXISTS unite_legale
               (siren,
                date_creation_unite_legale,
                sigle,
                prenom,
                identifiant_association_unite_legale,
                tranche_effectif_salarie_unite_legale,
                date_mise_a_jour_unite_legale,
                categorie_entreprise,
                etat_administratif_unite_legale,
                nom,
                nom_usage,
                nom_raison_sociale,
                nature_juridique_unite_legale,
                activite_principale_unite_legale,
                economie_sociale_solidaire_unite_legale)
                ''')

<sqlite3.Cursor at 0x7f815b1bd3b0>

In [69]:
cursor.execute('''
                CREATE UNIQUE INDEX index_siren
                ON unite_legale (siren);
                ''')

<sqlite3.Cursor at 0x7f815b1bd3b0>

In [70]:
connection.commit()

In [71]:
start_time = time.time()

df_unite_legale = pd.read_csv(
    "https://files.data.gouv.fr/insee-sirene/StockUniteLegale_utf8.zip",
    compression="zip",
    dtype=str,
    usecols=[
        "siren",
        "dateCreationUniteLegale",
        "sigleUniteLegale",
        "prenom1UniteLegale",
        "identifiantAssociationUniteLegale",
        "trancheEffectifsUniteLegale",
        "dateDernierTraitementUniteLegale",
        "categorieEntreprise",
        "etatAdministratifUniteLegale",
        "nomUniteLegale",
        "nomUsageUniteLegale",
        "denominationUniteLegale",
        "categorieJuridiqueUniteLegale",
        "activitePrincipaleUniteLegale",
        "economieSocialeSolidaireUniteLegale",
    ],
)
# Rename columns
df_unite_legale = df_unite_legale.rename(
    columns={
        "dateCreationUniteLegale": "date_creation_unite_legale",
        "sigleUniteLegale": "sigle",
        "prenom1UniteLegale": "prenom",
        "trancheEffectifsUniteLegale": "tranche_effectif_salarie_unite_legale",
        "dateDernierTraitementUniteLegale": "date_mise_a_jour_unite_legale",
        "categorieEntreprise": "categorie_entreprise",
        "etatAdministratifUniteLegale":"etat_administratif_unite_legale",
        "nomUniteLegale": "nom",
        "nomUsageUniteLegale": "nom_usage",
        "denominationUniteLegale": "nom_raison_sociale",
        "categorieJuridiqueUniteLegale": "nature_juridique_unite_legale",
        "activitePrincipaleUniteLegale": "activite_principale_unite_legale",
        "economieSocialeSolidaireUniteLegale":"economie_sociale_solidaire_unite_legale",
        "identifiantAssociationUniteLegale":"identifiant_association_unite_legale",
    }
)
stats()

--- 64.81285119056702 seconds ---
used memory : 20.8Go


In [72]:
start_time = time.time()
df_unite_legale.to_sql("unite_legale", connection, if_exists='append', index=False)
stats()

--- 145.21034264564514 seconds ---
used memory : 20.8Go


In [73]:
for row in cursor.execute('SELECT * FROM unite_legale LIMIT 10;'):
    print(row)

('000325175', '2000-09-26', None, 'THIERRY', None, None, '2019-12-13T13:21:28', 'PME', 'A', 'JANOYER', None, None, '1000', '32.12Z', None)
('001807254', '1972-05-01', None, 'JACQUES-LUCIEN', None, None, '2016-07-10T05:00:06', None, 'C', 'BRETON', None, None, '1000', '85.59A', None)
('005410220', '1954-12-25', None, 'GEORGES', None, None, None, None, 'C', 'WATTEBLED', None, None, '1000', '22.02', None)
('005410345', None, None, 'MICHEL', None, None, None, None, 'C', 'DEBRAY', None, None, '1000', '79.06', None)
('005410394', '1954-12-25', None, 'ROBERT', None, None, None, None, 'C', 'DAULT', None, None, '1000', '64.42', None)
('005410428', '1954-01-01', None, 'RENE', None, None, None, None, 'C', 'DINGEON', None, None, '1000', '70.2C', None)
('005410436', None, None, 'MARCEL', None, None, None, None, 'C', 'CARBONNET', None, None, '1000', '57.11', None)
('005410485', None, None, 'RENE', None, None, None, None, 'C', 'LECRIVAIN', None, None, '1000', '64.42', None)
('005410493', '1954-12-25',

### Add nom_complet

In [74]:
add_full_name = '''ALTER TABLE unite_legale ADD COLUMN nom_complet VARCHAR(45) GENERATED ALWAYS AS
      (case when nature_juridique_unite_legale == '1000'
            then (COALESCE(LOWER(prenom),'') || ' ' || COALESCE(LOWER(nom_usage), '') || ' (' || COALESCE(LOWER(nom), '') || ' ' || COALESCE(LOWER(sigle), '')|| ')')
            else (COALESCE(LOWER(nom_raison_sociale), '') || ' ' || COALESCE(LOWER(sigle), ''))
       end );
       '''
cursor.execute(add_full_name)

<sqlite3.Cursor at 0x7f815b1bd3b0>

In [75]:
connection.commit()

In [76]:
for row in cursor.execute('SELECT * FROM unite_legale LIMIT 10;'):
    print(row)

('000325175', '2000-09-26', None, 'THIERRY', None, None, '2019-12-13T13:21:28', 'PME', 'A', 'JANOYER', None, None, '1000', '32.12Z', None, 'thierry  (janoyer )')
('001807254', '1972-05-01', None, 'JACQUES-LUCIEN', None, None, '2016-07-10T05:00:06', None, 'C', 'BRETON', None, None, '1000', '85.59A', None, 'jacques-lucien  (breton )')
('005410220', '1954-12-25', None, 'GEORGES', None, None, None, None, 'C', 'WATTEBLED', None, None, '1000', '22.02', None, 'georges  (wattebled )')
('005410345', None, None, 'MICHEL', None, None, None, None, 'C', 'DEBRAY', None, None, '1000', '79.06', None, 'michel  (debray )')
('005410394', '1954-12-25', None, 'ROBERT', None, None, None, None, 'C', 'DAULT', None, None, '1000', '64.42', None, 'robert  (dault )')
('005410428', '1954-01-01', None, 'RENE', None, None, None, None, 'C', 'DINGEON', None, None, '1000', '70.2C', None, 'rene  (dingeon )')
('005410436', None, None, 'MARCEL', None, None, None, None, 'C', 'CARBONNET', None, None, '1000', '57.11', None, 

### Add entrepreneur individuel and section activité principale

In [78]:
add_entre_indiv = '''ALTER TABLE unite_legale ADD COLUMN is_entrepreneur_individuel INT GENERATED ALWAYS AS
      (case when nature_juridique_unite_legale in ("1", "10", "1000")
            then 1
            else 0
       end );
       '''
cursor.execute(add_entre_indiv)

<sqlite3.Cursor at 0x7f815b1bd3b0>

In [81]:
for row in cursor.execute('SELECT * FROM unite_legale LIMIT 5;'):
    print(row)

('000325175', '2000-09-26', None, 'THIERRY', None, None, '2019-12-13T13:21:28', 'PME', 'A', 'JANOYER', None, None, '1000', '32.12Z', None, 'thierry  (janoyer )', 1)
('001807254', '1972-05-01', None, 'JACQUES-LUCIEN', None, None, '2016-07-10T05:00:06', None, 'C', 'BRETON', None, None, '1000', '85.59A', None, 'jacques-lucien  (breton )', 1)
('005410220', '1954-12-25', None, 'GEORGES', None, None, None, None, 'C', 'WATTEBLED', None, None, '1000', '22.02', None, 'georges  (wattebled )', 1)
('005410345', None, None, 'MICHEL', None, None, None, None, 'C', 'DEBRAY', None, None, '1000', '79.06', None, 'michel  (debray )', 1)
('005410394', '1954-12-25', None, 'ROBERT', None, None, None, None, 'C', 'DAULT', None, None, '1000', '64.42', None, 'robert  (dault )', 1)


In [93]:
sections_NAF = {
"01":"A","02":"A","03":"A","05":"B","06":"B","07":"B","08":"B","09":"B","10":"C","11":"C","12":"C","13":"C","14":"C",
 "15":"C","16":"C","17":"C","18":"C","19":"C","20":"C","21":"C","22":"C","23":"C","24":"C","25":"C","26":"C","27":"C",
 "28":"C","29":"C","30":"C","31":"C","32":"C","33":"C","35":"D","36":"E","37":"E","38":"E","39":"E","41":"F","42":"F",
 "43":"F","45":"G","46":"G","47":"G","49":"H","50":"H","51":"H","52":"H","53":"H","55":"I","56":"I","58":"J","59":"J",
 "60":"J","61":"J","62":"J","63":"J","64":"K","65":"K","66":"K","68":"L","69":"M","70":"M","71":"M","72":"M","73":"M",
 "74":"M","75":"M","77":"N","78":"N","79":"N","80":"N","81":"N","82":"N","84":"O","85":"P","86":"Q","87":"Q","88":"Q",
 "90":"R","91":"R","92":"R","93":"R","94":"S","95":"S","96":"S","97":"T","98":"T","99":"U"
}

In [94]:
def create_section(activite_principale_unite_legale):
    code_naf = activite_principale_unite_legale[:2]
    section_activite_principale = sections_NAF[code_naf] if code_naf in sections_NAF else None
    return section_activite_principale

In [97]:
connection.create_function("add_section", 1, create_section)

In [89]:
# Add column
create_section_column = '''ALTER TABLE unite_legale
                        ADD COLUMN section_activite_principale
                        ''';
cursor.execute(create_section_column)

<sqlite3.Cursor at 0x7f815b1bd3b0>

In [98]:
# create and execute sql query
add_section = '''
    SELECT add_section(COALESCE(activite_principale_unite_legale,'')), siren
    FROM unite_legale
    WHERE true'''
cursor.execute(add_section)

<sqlite3.Cursor at 0x7f815b1bd3b0>

In [99]:
start_time = time.time()
cursor.executemany('''UPDATE unite_legale SET section_activite_principale = ? WHERE siren=?''', cursor.fetchall())
stats()

--- 159.06145071983337 seconds ---
used memory : 20.8Go


In [None]:
start_time = time.time()
cursor.executemany('''UPDATE unite_legale SET section_activite_principale = ? WHERE siren=?''', cursor.fetchall())
stats()

In [100]:
for row in cursor.execute('SELECT * FROM unite_legale LIMIT 5;'):
    print(row)

('000325175', '2000-09-26', None, 'THIERRY', None, None, '2019-12-13T13:21:28', 'PME', 'A', 'JANOYER', None, None, '1000', '32.12Z', None, 'thierry  (janoyer )', 1, 'C')
('001807254', '1972-05-01', None, 'JACQUES-LUCIEN', None, None, '2016-07-10T05:00:06', None, 'C', 'BRETON', None, None, '1000', '85.59A', None, 'jacques-lucien  (breton )', 1, 'P')
('005410220', '1954-12-25', None, 'GEORGES', None, None, None, None, 'C', 'WATTEBLED', None, None, '1000', '22.02', None, 'georges  (wattebled )', 1, 'C')
('005410345', None, None, 'MICHEL', None, None, None, None, 'C', 'DEBRAY', None, None, '1000', '79.06', None, 'michel  (debray )', 1, 'N')
('005410394', '1954-12-25', None, 'ROBERT', None, None, None, None, 'C', 'DAULT', None, None, '1000', '64.42', None, 'robert  (dault )', 1, 'K')


In [84]:
add_section = f'''ALTER TABLE unite_legale ADD COLUMN section_activite_principale GENERATED ALWAYS AS
                  {sections_NAF}[substr(COALESCE(activite_principale_unite_legale),1,2){[}'''
cursor.execute(add_section)

SyntaxError: f-string: closing parenthesis '}' does not match opening parenthesis '[' (640329609.py, line 2)

## Établissements

In [101]:
# Create list of departement zip codes
all_deps = [
    *"-0".join(list(str(x) for x in range(0, 10))).split("-")[1:],
    *list(str(x) for x in range(10, 20)),
    *["2A", "2B"],
    *list(str(x) for x in range(21, 96)),
    *"-7510".join(list(str(x) for x in range(0, 10))).split("-")[1:],
    *"-751".join(list(str(x) for x in range(10, 21))).split("-")[1:],
    *["971", "972", "973", "974", "976"],
    *[""],
]
# Remove Paris zip code
all_deps.remove("75")

In [30]:
all_deps = ["23"]

In [102]:
%%time
# Upload geo data by departement
for dep in all_deps:
    start_time = time.time()
    url = "https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_" + dep + ".csv.gz"
    print(url)
    df_dep = pd.read_csv(
        url,
        compression="gzip",
        dtype=str,
        usecols=[
            "siren",
            "siret",
            "dateCreationEtablissement",
            "trancheEffectifsEtablissement",
            "activitePrincipaleRegistreMetiersEtablissement",
            "etablissementSiege",
            "numeroVoieEtablissement",
            "libelleVoieEtablissement",
            "codePostalEtablissement",
            "libelleCommuneEtablissement",
            "libelleCedexEtablissement",
            "typeVoieEtablissement",
            "codeCommuneEtablissement",
            "codeCedexEtablissement",
            "complementAdresseEtablissement",
            "distributionSpecialeEtablissement",
            "complementAdresse2Etablissement",
            "indiceRepetition2Etablissement",
            "libelleCedex2Etablissement",
            "codeCedex2Etablissement",
            "numeroVoie2Etablissement",
            "typeVoie2Etablissement",
            "libelleVoie2Etablissement",
            "codeCommune2Etablissement",
            "libelleCommune2Etablissement",
            "distributionSpeciale2Etablissement",
            "dateDebut",
            "etatAdministratifEtablissement",
            "enseigne1Etablissement",
            "enseigne1Etablissement",
            "enseigne2Etablissement",
            "enseigne3Etablissement",
            "denominationUsuelleEtablissement",
            "activitePrincipaleEtablissement",
            "geo_adresse",
            "geo_id",
            "longitude",
            "latitude",
            "indiceRepetitionEtablissement",
            "libelleCommuneEtrangerEtablissement",
            "codePaysEtrangerEtablissement",
            "libellePaysEtrangerEtablissement",
            "libelleCommuneEtranger2Etablissement",
            "codePaysEtranger2Etablissement",
            "libellePaysEtranger2Etablissement",
        ],
    )
    df_dep = df_dep.rename(
        columns={
            "dateCreationEtablissement": "date_creation",
            "trancheEffectifsEtablissement": "tranche_effectif_salarie",
            "activitePrincipaleRegistreMetiersEtablissement": "activite_principale_registre_metier",
            "etablissementSiege": "is_siege",
            "numeroVoieEtablissement": "numero_voie",
            "typeVoieEtablissement": "type_voie",
            "libelleVoieEtablissement": "libelle_voie",
            "codePostalEtablissement": "code_postal",
            "libelleCedexEtablissement": "libelle_cedex",
            "libelleCommuneEtablissement": "libelle_commune",
            "codeCommuneEtablissement": "commune",
            "complementAdresseEtablissement": "complement_adresse",
            "complementAdresse2Etablissement": "complement_adresse_2",
            "numeroVoie2Etablissement": "numero_voie_2",
            "indiceRepetition2Etablissement": "indice_repetition_2",
            "typeVoie2Etablissement": "type_voie_2",
            "libelleVoie2Etablissement": "libelle_voie_2",
            "codeCommune2Etablissement": "commune_2",
            "libelleCommune2Etablissement": "libelle_commune_2",
            "codeCedex2Etablissement": "cedex_2",
            "libelleCedex2Etablissement": "libelle_cedex_2",
            "codeCedexEtablissement": "cedex",
            "dateDebut": "date_debut_activite",
            "distributionSpecialeEtablissement": "distribution_speciale",
            "distributionSpeciale2Etablissement": "distribution_speciale_2",
            "etatAdministratifEtablissement": "etat_administratif_etablissement",
            "enseigne1Etablissement": "enseigne_1",
            "enseigne2Etablissement": "enseigne_2",
            "enseigne3Etablissement": "enseigne_3",
            "activitePrincipaleEtablissement": "activite_principale",
            "indiceRepetitionEtablissement": "indice_repetition",
            "denominationUsuelleEtablissement": "nom_commercial",
            "libelleCommuneEtrangerEtablissement": "libelle_commune_etranger",
            "codePaysEtrangerEtablissement": "code_pays_etranger",
            "libellePaysEtrangerEtablissement": "libelle_pays_etranger",
            "libelleCommuneEtranger2Etablissement": "libelle_commune_etranger_2",
            "codePaysEtranger2Etablissement": "code_pays_etranger_2",
            "libellePaysEtranger2Etablissement": "libelle_pays_etranger_2",
        }
    )
    stats()
    cursor.execute(f'''DROP TABLE IF EXISTS {f"siret_{dep}"}''')
    cursor.execute(f'''CREATE TABLE IF NOT EXISTS {f"siret_{dep}"}
            (siren,
            siret,
            date_creation,
            tranche_effectif_salarie,
            activite_principale_registre_metier,
            is_siege,
            numero_voie,
            type_voie,
            libelle_voie,
            code_postal,
            libelle_cedex,
            libelle_commune,
            commune,
            complement_adresse,
            complement_adresse_2,
            numero_voie_2,
            indice_repetition_2,
            type_voie_2,
            libelle_voie_2,
            commune_2,
            libelle_commune_2,
            cedex_2,
            libelle_cedex_2,
            cedex,
            date_debut_activite,
            distribution_speciale,
            distribution_speciale_2,
            etat_administratif_etablissement,
            enseigne_1,
            enseigne_2,
            enseigne_3,
            activite_principale,
            indice_repetition,
            nom_commercial,
            libelle_commune_etranger,
            code_pays_etranger,
            libelle_pays_etranger,
            libelle_commune_etranger_2,
            code_pays_etranger_2,
            libelle_pays_etranger_2,
            longitude,
            latitude,
            geo_adresse,
            geo_id)
            ''')
    
    start_time = time.time()
    df_dep.to_sql(f"siret_{dep}", connection, if_exists='append', index=False)
    connection.commit()
    stats()

https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_01.csv.gz
--- 1.8313565254211426 seconds ---
used memory : 20.8Go
--- 4.522620677947998 seconds ---
used memory : 20.9Go
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_02.csv.gz
--- 1.3609952926635742 seconds ---
used memory : 20.8Go
--- 3.1554691791534424 seconds ---
used memory : 20.8Go
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_03.csv.gz
--- 1.1623854637145996 seconds ---
used memory : 20.8Go
--- 2.612961769104004 seconds ---
used memory : 20.8Go
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_04.csv.gz
--- 0.8079321384429932 seconds ---
used memory : 20.8Go
--- 1.7960841655731201 seconds ---
used memory : 21.1Go
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_05.csv.gz
--- 0.9493556022644043 seconds ---
used memory : 20.9Go
--- 1.862804889678955 seconds ---
used memory : 20.8Go
https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_06.csv.gz
--- 6.2082579135894775 seconds ---
used m

In [103]:
for row in cursor.execute('SELECT * FROM siret_23 LIMIT 2;'):
    print(row)

('038822102', '03882210200018', '1997-12-25', None, None, 'true', None, None, None, '23460', None, 'LE MONTEIL-AU-VICOMTE', '23134', None, None, None, None, None, None, None, None, None, None, None, '2008-01-01', None, None, 'A', None, None, None, '81.10Z', None, None, None, None, None, None, None, None, None, None, None, None)
('039016357', '03901635700012', None, None, None, 'true', None, None, None, '23700', None, 'AUZANCES', '23013', None, None, None, None, None, None, None, None, None, None, None, '1997-12-31', None, None, 'F', None, None, None, '70.3C', None, None, None, None, None, None, None, None, None, None, None, None)


### Add enseigne

In [50]:
for dep in all_deps:
    start_time = time.time()
    print(dep)
    add_enseigne = f'''ALTER TABLE {f"siret_{dep}"} ADD COLUMN enseignes GENERATED ALWAYS AS
               (COALESCE(enseigne_1, '') || COALESCE(enseigne_2, ' ') || COALESCE(enseigne_3, ' ') || COALESCE(nom_commercial, ''))
               '''
    cursor.execute(add_enseigne)
    stats()

23
--- 0.004804134368896484 seconds ---
used memory : 19.5Go


In [51]:
for row in cursor.execute('SELECT enseigne_1, enseigne_2, enseigne_3, nom_commercial, enseignes FROM siret_23 LIMIT 10;'):
    print(row)

(None, None, None, None, '  ')
(None, None, None, None, '  ')
(None, None, None, None, '  ')
(None, None, None, None, '  ')
(None, None, None, None, '  ')
('BARREIGE-REYNAUD', None, None, None, 'BARREIGE-REYNAUD  ')
('BARREIGE-REYNAUD', None, None, None, 'BARREIGE-REYNAUD  ')
('BARREIGE REYNAUD', None, None, None, 'BARREIGE REYNAUD  ')
(None, None, None, None, '  ')
('ONET SERVICES MONTLUCON GUERET', None, None, None, 'ONET SERVICES MONTLUCON GUERET  ')


### Create Adresse Compléte

In [52]:
def adresse_complete(complement_adresse, numero_voie, indice_repetition, type_voie, libelle_voie, libelle_commune, libelle_cedex, distribution_speciale, commune, cedex, libelle_commune_etranger, libelle_pays_etranger):    
    col_list = [complement_adresse, numero_voie, indice_repetition, type_voie, libelle_voie, distribution_speciale]
    adresse = ""
    for column in col_list:
        adresse = adresse + " " + column if column is not None else ""
    if cedex=="":
        if commune=="":
            adresse =  adresse
        else:
            adresse = adresse + " " + commune + " " + libelle_commune
    else:
        adresse = adresse + " " + cedex + " " + libelle_cedex
    etranger_list = [libelle_commune_etranger, libelle_pays_etranger]
    for column in etranger_list:
        adresse = adresse + " " + column if column is not None else ""
    return adresse.strip()

In [None]:
def add_columns(commune, longitude, latitude):
    departement = str(commune)[:3] if str(commune)[:2]== "97" else (None if commune is None else str(commune)[:2])
    coordonnees = None if (longitude is None) or (atitude is None) else f"{latitude},{longitude}"
    return 

In [53]:
connection.create_function("adresse_complete", 12, adresse_complete)

In [55]:
# Add column
create_address_column = '''ALTER TABLE siret_23 
                        ADD COLUMN adresse_complete
                        ''';
cursor.execute(create_address_column)

<sqlite3.Cursor at 0x7f815ffaece0>

In [56]:
# Add column
create_address2_column = '''ALTER TABLE siret_23 
                        ADD COLUMN adresse_complete_2''';
cursor.execute(create_address2_column)

<sqlite3.Cursor at 0x7f815ffaece0>

In [57]:
# create and execute sql query
add_adresse_complete = '''
    SELECT adresse_complete(COALESCE(complement_adresse,''),COALESCE(numero_voie,''), COALESCE(indice_repetition,''), COALESCE(type_voie,''),
    COALESCE(libelle_voie,''), COALESCE(libelle_commune,''), COALESCE(libelle_cedex,''), COALESCE(distribution_speciale,''),
    COALESCE(commune,''), COALESCE(cedex,''), COALESCE(libelle_commune_etranger,''), COALESCE(libelle_pays_etranger,'')), 
    
    adresse_complete(COALESCE(complement_adresse_2,''),COALESCE(numero_voie_2,''), COALESCE(indice_repetition_2,''), COALESCE(type_voie_2,''),
    COALESCE(libelle_voie_2,''), COALESCE(libelle_commune_2,''), COALESCE(libelle_cedex_2,''), COALESCE(distribution_speciale_2,''),
    COALESCE(commune_2,''), COALESCE(cedex_2,''), COALESCE(libelle_commune_etranger_2,''), COALESCE(libelle_pays_etranger_2,'')), 
    
    
    
    
    siret
    FROM siret_23
    WHERE true'''

In [58]:
cursor.execute(add_adresse_complete)

<sqlite3.Cursor at 0x7f815ffaece0>

In [59]:
start_time = time.time()
cursor.executemany('''UPDATE siret_23 SET adresse_complete = ?, adresse_complete_2 = ? WHERE siret=?''', cursor.fetchall())
stats()

--- 514.9059031009674 seconds ---
used memory : 19.4Go


In [60]:
for row in cursor.execute('SELECT * FROM siret_23 LIMIT 10;'):
    print(row)

('038822102', '03882210200018', '1997-12-25', None, None, 'true', None, None, None, '23460', None, 'LE MONTEIL-AU-VICOMTE', '23134', None, None, None, None, None, None, None, None, None, None, None, '2008-01-01', None, None, 'A', None, None, None, '81.10Z', None, None, None, None, None, None, None, None, None, None, None, None, '  ', '23134 LE MONTEIL-AU-VICOMTE', '')
('039016357', '03901635700012', None, None, None, 'true', None, None, None, '23700', None, 'AUZANCES', '23013', None, None, None, None, None, None, None, None, None, None, None, '1997-12-31', None, None, 'F', None, None, None, '70.3C', None, None, None, None, None, None, None, None, None, None, None, None, '  ', '23013 AUZANCES', '')
('039027305', '03902730500018', None, None, None, 'true', None, 'RUE', 'DE L ETANG', '23190', None, 'BELLEGARDE-EN-MARCHE', '23020', None, None, None, None, None, None, None, None, None, None, None, '1997-12-31', None, None, 'F', None, None, None, '70.3C', None, None, None, None, None, None, 

In [None]:
for row in cursor.fetchall():
    start_time = time.time()
    print(row)
    cursor.execute('''UPDATE siret_23 SET adresse_complete = ? WHERE siret=?''', row)
    stats()

In [None]:
connection.commit()

In [None]:
for row in cursor.execute('SELECT * FROM siret_94 LIMIT 10;'):
    print(row)

In [None]:
adresse_complete("rue","20","","","","","","","","","","")

## Nombre d'établissements et nombre d'établissements ouverts