In [3]:
import pandas as pd
import requests
from pathlib import Path
import plotly.express as px

In [15]:
# Catalogue of the data.gouv.fr exports analysed in this notebook.
# Each entry carries:
#   'id'        - short label, reused as the column name of that export in the
#                 aggregated tables
#   'url'       - URL queried by download() to locate the file
#   'date_cols' - columns parsed as dates when the CSV is loaded; the first one
#                 is the creation date used for the time-based groupings
data = [
    {'id': export_id, 'url': export_url, 'date_cols': list(date_columns)}
    for export_id, export_url, date_columns in (
        (
            'datasets',
            'https://www.data.gouv.fr/fr/datasets/r/f868cca6-8da1-4369-a78d-47463f19a9a3',
            ('created_at', 'last_modified'),
        ),
        (
            'resources',
            'https://www.data.gouv.fr/fr/datasets/r/4babf5f2-6a9c-45b5-9144-ca5eae6a7a6d',
            ('created_at', 'modified'),
        ),
        (
            'reuses',
            'https://www.data.gouv.fr/fr/datasets/r/970aafa0-3778-4d8b-b9d1-de937525e379',
            ('created_at', 'last_modified'),
        ),
        (
            'discussions',
            'https://www.data.gouv.fr/fr/datasets/r/d77705e1-4ecd-461c-8c24-662d47c4c2f9',
            ('created', 'closed'),
        ),
        (
            'organisations',
            'https://www.data.gouv.fr/fr/datasets/r/b7bbfedc-2448-4135-a6c7-104548d396e7',
            ('created_at', 'last_modified'),
        ),
    )
]

def download():
    """Fetch every export listed in ``data`` into ``./data`` unless already cached.

    For each entry a HEAD request on ``item['url']`` is used to read the
    ``Location`` header; the last path segment of that target is the local
    file name. A file already present under ``./data`` is not fetched again.
    As a side effect every entry receives a ``'filepath'`` key pointing at its
    local copy.

    Returns:
        bool: True when at least one file was downloaded during this call.

    Raises:
        requests.HTTPError: when the server answers with a 4xx/5xx status.
        requests.RequestException: on timeouts and connection failures.
    """
    from urllib.parse import urljoin

    timeout = 60  # seconds; without it a stalled connection blocks the cell forever
    has_changed = False
    data_path = Path('./data')
    data_path.mkdir(exist_ok=True)
    for item in data:
        r = requests.head(item['url'], timeout=timeout)
        r.raise_for_status()
        # urljoin resolves a relative Location against the queried URL and
        # falls back to the queried URL itself when there is no redirect.
        location = urljoin(item['url'], r.headers.get('Location', ''))
        filename = location.split('/')[-1]
        filepath = data_path / filename
        if not filepath.exists():
            r = requests.get(location, timeout=timeout)
            # Never cache an HTTP error page as if it were the CSV export.
            r.raise_for_status()
            # Write next to the target, then rename: an interrupted write must
            # not leave a truncated file that later counts as a cache hit.
            partial = filepath.with_name(filepath.name + '.part')
            with open(partial, 'wb') as dfile:
                dfile.write(r.content)
            partial.replace(filepath)
            has_changed = True
        item['filepath'] = filepath
    return has_changed

In [16]:
# Fetch every export that is not cached under ./data yet; the displayed value
# is download()'s return flag (True when at least one file was newly fetched).
download()

False

In [17]:
# Load each downloaded export into a DataFrame stored under the entry's 'df'
# key: the files are ';'-separated and the declared date columns are parsed.
for datum in data:
    datum['df'] = pd.read_csv(
        datum['filepath'],
        sep=';',
        parse_dates=datum['date_cols'],
    )

In [43]:
# Compute object creation by year: one row per year, one column per export.
# The per-export series are aligned on the union of their year bins, so a year
# in which one export has no rows is kept with a count of 0 instead of being
# dropped for every export (which is what an inner index merge does).
_yearly = []
for datum in data:
    created = datum['date_cols'][0]  # the first date column is the creation date
    _df = (
        datum['df']
        .groupby(pd.Grouper(key=created, freq="Y"))['id']  # count only 'id', not every column
        .count()
        .rename(datum['id'])
    )
    _yearly.append(_df)
# Bins absent from one export come out of the outer alignment as NaN: they mean "0 created".
df_year = pd.concat(_yearly, axis=1).sort_index().fillna(0).astype(int)
df_year

Unnamed: 0,datasets,resources,reuses,discussions,organisations
2014-12-31,1638,9801,182,177,341
2015-12-31,5629,2753,409,223,206
2016-12-31,6173,3787,240,241,240
2017-12-31,11086,29810,326,786,411
2018-12-31,5479,31434,290,1337,650
2019-12-31,6121,29682,343,2024,541
2020-12-31,4246,134851,498,2574,326


In [28]:
# Bar chart of df_year: calendar years on the x axis, the per-export columns
# as y values.
fig = px.bar(
    df_year,
    x=df_year.index.year,
    y=[entry['id'] for entry in data],
    title='Nombre d\'objets créés par an',
)
fig.show()

In [47]:
from datetime import datetime

# Start of the window: midnight on the first day of the month that contained
# "now minus 365 days". microsecond is reset as well, otherwise the leftover
# fraction of a second excludes rows stamped exactly at that midnight.
start_date = datetime.now() - pd.Timedelta(days=365)
start_date = start_date.replace(day=1, hour=0, minute=0, second=0, microsecond=0)

# Compute object creation by month over that window: one row per month, one
# column per export. Series are aligned on the union of their month bins so a
# month without any row for one export is kept with a count of 0 rather than
# being dropped for every export.
_monthly = []
for datum in data:
    created = datum['date_cols'][0]  # the first date column is the creation date
    _recent = datum['df'].loc[datum['df'][created] >= start_date]
    _df = (
        _recent
        .groupby(pd.Grouper(key=created, freq="M"))['id']  # count only 'id', not every column
        .count()
        .rename(datum['id'])
    )
    _monthly.append(_df)
# Bins absent from one export come out of the outer alignment as NaN: they mean "0 created".
df_month = pd.concat(_monthly, axis=1).sort_index().fillna(0).astype(int)

df_month

Unnamed: 0_level_0,datasets,resources,reuses,discussions,organisations
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-11-30,414,1563,22,175,31
2019-12-31,189,3052,60,147,34
2020-01-31,538,20938,24,229,40
2020-02-29,369,50577,17,189,43
2020-03-31,481,11260,70,332,46
2020-04-30,313,4361,108,363,49
2020-05-31,236,4041,73,254,22
2020-06-30,343,4659,35,224,23
2020-07-31,422,7355,40,227,20
2020-08-31,168,5951,27,190,14


In [48]:
# Bar chart of df_month: months formatted as YYYY-MM on the x axis, the
# per-export columns as y values.
fig = px.bar(
    df_month,
    x=df_month.index.strftime("%Y-%m"),
    y=[entry['id'] for entry in data],
    title='Nombre d\'objets créés dans les 12 derniers mois',
)
fig.show()

In [96]:
# Geo resources: monthly count of resources whose URL points at
# files.geo.data.gouv.fr.
# /!\ only an estimate, the url based filter is not exhaustive
df = [d for d in data if d['id'] == 'resources'][0]['df']
# regex=False: match the host name literally; with the default regex matching
# every '.' is a wildcard. na=False treats rows without a URL as non-matching.
df_inspire = df[df['url'].str.contains('files.geo.data.gouv.fr', na=False, regex=False)]
# Count only the 'id' column per month instead of counting every column.
df_inspire = df_inspire.groupby(pd.Grouper(key='created_at', freq="M"))['id'].count().rename('resources_geo')
fig = px.bar(df_inspire, x=df_inspire.index.strftime("%Y-%m"), y=['resources_geo'])
fig.show()

In [75]:
# Ten dataset slugs owning the most resources, all time.
# 'dataset.id' is counted per slug and shown as 'resources_count'.
df = [d for d in data if d['id'] == 'resources'][0]['df']
(
    df.groupby('dataset.slug')
    .count()
    .sort_values('dataset.id', ascending=False)['dataset.id']
    .rename('resources_count')
    .iloc[:10]
    .to_frame()
)

Unnamed: 0_level_0,resources_count
dataset.slug,Unnamed: 1_level_1
datatourisme-la-base-nationale-des-donnees-du-tourisme-en-open-data,3570
donnees-temps-reel-de-mesure-des-concentrations-de-polluants-atmospheriques-reglementes-1,2161
donnees-essentielles-de-la-commande-publique-fichiers-consolides,751
arretes-de-stationnement,558
pyrenees-atlantiques-bd-adresse-v2-2,555
documents-darchives-du-climat-numerises-releves-dobservations-meteorologiques-quotidiennes-de-france-metropolitaine,551
gironde-bd-adresse-v2-2,541
dordogne-bd-adresse-v2-2,523
donnees-issues-des-campagnes-exceptionnelles-campex-2011-2013-substances-eaux-souterraines-detail-par-parametre,510
etablissements-de-sante-par-ght,486


In [74]:
# Ten dataset slugs with the most resources created during 2020.
# The window is closed on both sides so the result stays "2020" even when the
# export contains resources created in later years.
df = [d for d in data if d['id'] == 'resources'][0]['df']
(
    df[(df['created_at'] >= '2020-01-01') & (df['created_at'] < '2021-01-01')]
    .groupby('dataset.slug')
    .count()
    .sort_values('dataset.id', ascending=False)['dataset.id']
    .rename('resources_count')
    .iloc[:10]
    .to_frame()
)

Unnamed: 0_level_0,resources_count
dataset.slug,Unnamed: 1_level_1
donnees-temps-reel-de-mesure-des-concentrations-de-polluants-atmospheriques-reglementes-1,2151
datatourisme-la-base-nationale-des-donnees-du-tourisme-en-open-data,1570
donnees-issues-des-campagnes-exceptionnelles-campex-2011-2013-substances-eaux-souterraines-detail-par-parametre,510
donnees-essentielles-de-la-commande-publique-fichiers-consolides,439
evolution-des-haies-arch-entre-2005-et-2009-des-territoires-du-nord-et-du-pas-de-calais-20,408
evolution-des-habitats-naturels-arch-entre-2005-et-2009-des-territoires-du-nord-et-du-pas-16,408
evolution-des-haies-arch-entre-2009-et-2013-des-territoires-du-nord-et-du-pas-de-calais-20,408
haies-arch-2005-des-territoires-du-nord-et-du-pas-de-calais-1,408
evolution-des-haies-arch-entre-2005-et-2013-des-territoires-du-nord-et-du-pas-de-calais-20,402
evolution-des-habitats-naturels-arch-entre-2009-et-2013-des-territoires-du-nord-et-du-pas-22,402


In [78]:
# Ten dataset slugs with the most resources created in February 2020
# (2020-02-01 included, 2020-03-01 excluded).
df = [d for d in data if d['id'] == 'resources'][0]['df']
(
    df.loc[lambda frame: (frame['created_at'] >= '2020-02-01') & (frame['created_at'] < '2020-03-01')]
    .groupby('dataset.slug')
    .count()
    .sort_values('dataset.id', ascending=False)['dataset.id']
    .rename('resources_count')
    .iloc[:10]
    .to_frame()
)

Unnamed: 0_level_0,resources_count
dataset.slug,Unnamed: 1_level_1
base-sirene-de-la-metropole-de-lyon,234
datatourisme-la-base-nationale-des-donnees-du-tourisme-en-open-data,145
ddtm-76-zone-dalea-du-pprn-de-la-vallee-de-seine-boucle-delbeuf,75
plan-de-prevention-du-risque-inondation-de-pont-audemer,74
zone-reglementee-pprt-basf-coatex-20110004-rhone,73
ppr-de-sevignacq-meyracq-64ddtm-20020009-plan-de-prevention-des-risques-naturels-pprn-de-la-commune-de-sevignacq-meyracq-64522-departement-des-pyrenees-atlantiques,68
plu-du-20-07-2015-de-saint-sorlin-de-morestel-38458,66
les-pos-et-plu-numerises-du-departement-de-leure,61
ppr-arette-64ddtm20010010-plan-de-prevention-des-risques-naturels-pprn-de-la-commune-de-arette-64040-departement-des-pyrenees-atlantiques,60
plan-de-prevention-des-risques-naturels-de-la-balme-de-thuy-haute-savoie-approuve-le-03-05-1999,60


In [65]:
# Top 50 page views all time, per dataset slug.
# The column is selected before summing: the first positional parameter of
# GroupBy.sum is `numeric_only`, not a column name, so sum('metric.views')
# summed every numeric column and only worked because the string is truthy.
df = [d for d in data if d['id'] == 'datasets'][0]['df']
(
    df.groupby('slug')['metric.views']
    .sum()
    .sort_values(ascending=False)
    .head(50)
    .to_frame()
)

Unnamed: 0_level_0,metric.views
slug,Unnamed: 1_level_1
liste-publique-des-organismes-de-formation-l-6351-7-1-du-code-du-travail,159646
service-public-fr-guide-vos-droits-et-demarches-particuliers,126652
associations-reconnues-d-utilite-publique,94226
correspondance-entre-les-codes-postaux-et-codes-insee-des-communes-francaises,58615
competence-territoriale-gendarmerie-et-police-nationales,57438
dates-et-lieux-des-collectes-de-don-du-sang,54538
plan-cadastral-informatise,50615
le-calendrier-scolaire,46630
fichier-fantoir-des-voies-et-lieux-dits,45245
service-public-fr-annuaire-de-l-administration-base-de-donnees-locales,43880


In [66]:
# Top 50 resource downloads all time, summed per dataset slug.
# The column is selected before summing: the first positional parameter of
# GroupBy.sum is `numeric_only`, not a column name, so sum('downloads')
# summed every numeric column and only worked because the string is truthy.
df = [d for d in data if d['id'] == 'resources'][0]['df']
(
    df.groupby('dataset.slug')['downloads']
    .sum()
    .sort_values(ascending=False)
    .head(50)
    .to_frame()
)

Unnamed: 0_level_0,downloads
dataset.slug,Unnamed: 1_level_1
liste-publique-des-organismes-de-formation-l-6351-7-1-du-code-du-travail,190863
demandes-de-valeurs-foncieres,109634
base-officielle-des-codes-postaux,103981
donnees-hospitalieres-relatives-a-lepidemie-de-covid-19,88152
le-calendrier-scolaire-format-ical,63485
plan-cadastral-informatise,61164
repertoire-national-des-associations,57394
correspondance-entre-les-codes-postaux-et-codes-insee-des-communes-francaises,46498
decoupage-administratif-communal-francais-issu-d-openstreetmap,44042
nomenclature-combinee-a-8-chiffres-nc8-depuis-2016,40301
