In [None]:
import pandas as pd
import os, shutil
from datetime import datetime
import json

In [None]:
# Destination folder for every file written by this notebook. A default is
# only assigned when the name is not already defined; a value defined before
# this cell runs is kept and echoed.
if "OUTPUT_DATA_FOLDER" not in locals():
    OUTPUT_DATA_FOLDER = "./output/"
else:
    print(OUTPUT_DATA_FOLDER)

# Start from an empty folder: remove whatever a previous run left behind.
if os.path.isdir(OUTPUT_DATA_FOLDER):
    shutil.rmtree(OUTPUT_DATA_FOLDER)
# Create the folder itself rather than its dirname, so the folder exists
# whether or not the configured path ends with a path separator.
os.makedirs(OUTPUT_DATA_FOLDER, exist_ok=True)

In [None]:
# Codes used to build the per-department file names, in processing order:
#   - "01".."09" (zero-padded), "10".."19"
#   - "2A" and "2B" (there is no "20" entry)
#   - "21".."94", except "75", which is replaced by the twenty codes
#     "75101".."75120" below
#   - "" (empty code, i.e. the file named "siren_.csv")
# The previous join/split construction dropped the first element of each
# joined range, which silently skipped "75110"; explicit generators avoid that.
alldeps = [
    *(str(n).zfill(2) for n in range(1, 10)),
    *(str(n) for n in range(10, 20)),
    "2A",
    "2B",
    *(str(n) for n in range(21, 95) if n != 75),
    *(str(75100 + n) for n in range(1, 21)),
    "",
]

In [None]:
def gb_column(df, column, dep):
    """Count rows per distinct value of ``column`` and tag the result with ``dep``.

    Args:
        df: DataFrame containing at least ``column`` and a ``"siren"`` column.
        column: Name of the column to group by.
        dep: Value written into every row of the added ``"dep"`` column.

    Returns:
        A new DataFrame with the columns ``[column, "siren", "dep"]`` where
        ``"siren"`` holds the number of non-null ``"siren"`` values in each
        group, sorted from the largest count to the smallest. ``df`` itself is
        not modified.
    """
    per_value = df.groupby(column, as_index=False).count()
    ranked = per_value[[column, "siren"]].sort_values(by="siren", ascending=False)
    ranked["dep"] = dep
    return ranked

In [None]:
# Current local date as "YYYY-MM-DD"; used in the download URL and as the
# "last_date" stamp of the exported JSON files.
today = datetime.today().date().isoformat()

In [None]:
# For every code in `alldeps`: download the CSV published under today's date,
# keep the rows whose "etat_administratif_etablissement" is "A", then write one
# count table per grouping column into OUTPUT_DATA_FOLDER.
base_url = (
    "https://object.files.data.gouv.fr/opendata/dag_datalake_sirene/insert-elk-sirene/"
    + today
    + "/format_sirene_notebook/output/siren_"
)
for dep in alldeps:
    print(dep)
    # Everything is read as text so codes keep their exact spelling.
    df = pd.read_csv(base_url + dep + ".csv", dtype=str)
    df = df[df["etat_administratif_etablissement"] == "A"]
    for column in ("nature_juridique_entreprise", "activite_principale_entreprise"):
        dfinter = gb_column(df, column, dep)
        dfinter.to_csv(
            OUTPUT_DATA_FOLDER + "/" + column + "_dep_" + dep + ".csv",
            index=False,
        )

In [None]:
# NOTE(review): an earlier cell of this notebook already defines `gb_column`
# with the same signature and behavior; one of the two definitions is redundant.
def gb_column(df, column, dep):
    """Tally the rows of ``df`` by ``column`` and label the tally with ``dep``.

    Args:
        df: DataFrame holding ``column`` and a ``"siren"`` column.
        column: Column whose distinct values form the groups.
        dep: Constant stored in the ``"dep"`` column of the result.

    Returns:
        A new DataFrame ``[column, "siren", "dep"]``; ``"siren"`` is the count
        of non-null ``"siren"`` entries per group, ordered by decreasing count.
    """
    grouped = df.groupby(column, as_index=False)
    tally = grouped.count().loc[:, [column, "siren"]]
    tally = tally.sort_values("siren", ascending=False)
    tally["dep"] = dep
    return tally

In [None]:
# Lookup table with one "dep" code and its "reg" code per row, read as text
# from the DEP and REG columns of the remote CSV.
deps = (
    pd.read_csv(
        "https://raw.githubusercontent.com/etalab/data-covid19-dashboard-widgets/master/utils/departement2021.csv",
        dtype=str,
    )
    .loc[:, ["DEP", "REG"]]
    .rename(columns={"DEP": "dep", "REG": "reg"})
)

In [None]:
# Stack the per-code "activite_principale_entreprise" count files written
# earlier into a single frame, in `alldeps` order. All files are read first and
# concatenated once: calling pd.concat inside the loop re-copies the growing
# frame on every iteration.
df = pd.concat(
    [
        pd.read_csv(
            OUTPUT_DATA_FOLDER + "/activite_principale_entreprise_dep_" + dep + ".csv",
            dtype=str,
        )
        for dep in alldeps
    ]
)

In [None]:
# The counts were read back from CSV as text; make them numeric so they can be
# summed later.
df["siren"] = df["siren"].astype(float)
# Attach the "reg" code of each "dep" (left join: rows without a match are kept).
df = df.merge(deps, on="dep", how="left")

# Columns of the NAF reference table that are kept and later iterated over.
catnafs = ["code_sous_classe", "code_division", "code_section"]
nafs = pd.read_csv(
    "https://raw.githubusercontent.com/etalab/dashboard-aides-entreprises/master/utils/naf_complet.csv",
    dtype=str,
)[catnafs]

# The activity code is joined against "code_sous_classe" of the reference
# table, which brings in the matching "code_division" and "code_section".
df = df.rename(columns={"activite_principale_entreprise": "code_sous_classe"})
df = df.merge(nafs, on="code_sous_classe", how="left")

In [None]:
def rm_and_create_folder(folder):
    """Make ``folder`` exist and be empty.

    If ``folder`` is an existing directory it is deleted with all its
    contents, then the directory (and any missing parents) is created.

    The directory itself is created rather than ``os.path.dirname(folder)``,
    so the result is the same whether or not ``folder`` ends with a path
    separator.

    Args:
        folder: Path of the directory to reset.
    """
    if os.path.isdir(folder):
        shutil.rmtree(folder)
    os.makedirs(folder, exist_ok=True)

In [None]:
# Reset the "naf" sub-folder of the output folder. os.path.join inserts a
# separator only when one is missing, and the trailing "" keeps the final
# separator on the resulting path.
rm_and_create_folder(os.path.join(OUTPUT_DATA_FOLDER, "naf", ""))

In [None]:
# For each NAF level in `catnafs` and each code of that level, write
# <output>/naf/<level>/<code>.json holding the summed "siren" counts for the
# whole data set ("france"), for every region and for every department.

# The region and department code lists do not depend on the NAF code, so they
# are computed once instead of once per code.
reg_codes = df.loc[df["reg"].notna(), "reg"].unique()
dep_codes = df.loc[df["dep"].notna(), "dep"].unique()

for cn in catnafs:
    print(cn)
    level_name = cn.replace("code_", "")
    # The trailing "" keeps a final separator on the folder path.
    level_folder = os.path.join(OUTPUT_DATA_FOLDER, "naf", level_name, "")
    rm_and_create_folder(level_folder)
    for item in df.loc[df[cn].notna(), cn].unique():
        print(item)
        # Filter the rows of this code once, then aggregate only the "siren"
        # column. Summing the whole filtered frame also "sums" its text
        # columns, and re-filtering the full frame for every region and
        # department repeats the same scan.
        item_rows = df[df[cn] == item]
        siren_by_reg = item_rows.groupby("reg")["siren"].sum()
        siren_by_dep = item_rows.groupby("dep")["siren"].sum()

        output = {
            "nom": "Nombre d'entreprises pour la "
            + level_name
            + " naf "
            + str(item),
            "unite": "entreprises",
            "unite_short": "entreprises",
            "france": [
                {
                    "level": "fra",
                    "code_level": "fra",
                    "last_date": today,
                    "last_value": str(item_rows["siren"].sum()),
                }
            ],
            # Every region / department gets an entry; a code with no rows
            # there is reported as 0.0, like the sum of an empty selection.
            "regions": [
                {
                    "level": "reg",
                    "code_level": reg,
                    "last_date": today,
                    "last_value": str(siren_by_reg.get(reg, 0.0)),
                }
                for reg in reg_codes
            ],
            "departements": [
                {
                    "level": "dep",
                    "code_level": dep,
                    "last_date": today,
                    "last_value": str(siren_by_dep.get(dep, 0.0)),
                }
                for dep in dep_codes
            ],
        }
        with open(os.path.join(level_folder, item + ".json"), "w") as f:
            json.dump(output, f)