In [1]:
import pandas as pd
import requests
from pathlib import Path
import plotly.express as px

In [2]:
# One entry per data.gouv.fr export used in this notebook: a short id and the
# resource URL it is fetched from. download() later adds a 'filepath' key to
# each entry.
data = [
    dict(id='datasets', url='https://www.data.gouv.fr/fr/datasets/r/f868cca6-8da1-4369-a78d-47463f19a9a3'),
    dict(id='resources', url='https://www.data.gouv.fr/fr/datasets/r/4babf5f2-6a9c-45b5-9144-ca5eae6a7a6d'),
    dict(id='reuses', url='https://www.data.gouv.fr/fr/datasets/r/970aafa0-3778-4d8b-b9d1-de937525e379'),
    dict(id='discussions', url='https://www.data.gouv.fr/fr/datasets/r/d77705e1-4ecd-461c-8c24-662d47c4c2f9'),
    dict(id='organisations', url='https://www.data.gouv.fr/fr/datasets/r/b7bbfedc-2448-4135-a6c7-104548d396e7'),
]

def download(timeout=30):
    """Fetch every export listed in ``data`` that is not already cached in ./data.

    For each entry, a HEAD request is sent to the entry's URL and the
    ``Location`` header of the response is taken as the address of the actual
    file. The last path segment of that address is used as the local filename,
    so a file is downloaded only when no file with that name exists yet in
    ``./data``. The local path is stored on the entry under ``'filepath'``.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        True if at least one file was downloaded, False if every file was
        already present locally.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the HEAD response carries no ``Location`` header.
    """
    has_changed = False
    data_path = Path('./data')
    data_path.mkdir(exist_ok=True)
    for item in data:
        r = requests.head(item['url'], timeout=timeout)
        # Fail loudly on 4xx/5xx instead of continuing with a bogus response.
        r.raise_for_status()
        location = r.headers['Location']
        filename = location.split('/')[-1]
        filepath = data_path / filename
        if not filepath.exists():
            # Stream into a temporary ".part" file and rename it only once the
            # transfer has completed, so an interrupted or failed download is
            # never mistaken for a valid cached file on the next run.
            partial = filepath.with_name(filename + '.part')
            with requests.get(location, stream=True, timeout=timeout) as r:
                r.raise_for_status()
                with open(partial, 'wb') as dfile:
                    for chunk in r.iter_content(chunk_size=1 << 20):
                        dfile.write(chunk)
            partial.replace(filepath)
            has_changed = True
        item['filepath'] = filepath
    return has_changed

In [3]:
# Fetch any export not yet cached in ./data. Besides returning whether something
# was downloaded, this sets item['filepath'] on every entry of `data`, which the
# loading cell below reads.
download()

False

In [4]:
# Load the 'datasets' and 'organisations' exports (semicolon-separated CSV),
# parsing both timestamp columns as datetimes.
date_cols = ['created_at', 'last_modified']
frames = {}
for export_id in ('datasets', 'organisations'):
    # Look up the descriptor for this export; its 'filepath' is set by download().
    datum = [entry for entry in data if entry['id'] == export_id][0]
    frames[export_id] = pd.read_csv(datum['filepath'], sep=';', parse_dates=date_cols)

datasets = frames['datasets']
organisations = frames['organisations']
organisations.head(1)

Unnamed: 0,id,name,slug,url,description,logo,badges,created_at,last_modified,metric.datasets,metric.members,metric.reuses,metric.followers,metric.views
0,5fad8daa9354256102cef0ca,Réseau national de télécommunications pour la ...,reseau-national-de-telecommunications-pour-la-...,http://www.data.gouv.fr/fr/organizations/resea...,Au service de la communauté Education-Recherch...,https://static.data.gouv.fr/avatars/07/e6142f9...,[],2020-11-12 20:31:54.707,2020-11-13 14:26:51.688,0,2,0,0,0


In [5]:
# Number of datasets and organisations created per year, from start_date onwards.
start_date = '2013-1-1'

recent_datasets = datasets[datasets['created_at'] >= start_date]
d_year = (
    recent_datasets
    .groupby(pd.Grouper(key="created_at", freq="Y"))['id']
    .count()
    .rename('datasets')
)

recent_organisations = organisations[organisations['created_at'] >= start_date]
o_year = (
    recent_organisations
    .groupby(pd.Grouper(key="created_at", freq="Y"))['id']
    .count()
    .rename('organisations')
)

# Combine the two yearly series into one frame, matched on their date index.
creations = pd.merge(d_year, o_year, left_index=True, right_index=True)

In [9]:
# Bar chart of the yearly creation counts, one series per object type.
creation_years = creations.index.year
fig = px.bar(
    creations,
    x=creation_years,
    y=['datasets', 'organisations'],
    title="Nombre d'objets créés par an",
)
fig.show()

In [7]:
# Number of datasets per year of last modification, from start_date onwards.
# Original note: unreliable creation dates going back to 1995.
# NOTE(review): the filter below is on 'last_modified', not 'created_at' —
# confirm which column that note is meant to describe.
start_date = '2014-1-1'
modified_since_start = datasets[datasets['last_modified'] >= start_date]
d_mod = modified_since_start.groupby(pd.Grouper(key="last_modified", freq="Y")).count()
fig = px.bar(
    x=d_mod.index.year,
    y=d_mod['id'],
    title='Nombre de jeux de données modifiés par an',
)
fig.show()

In [8]:
# Number of datasets created per month during calendar year 2020.
# The exclusive upper bound keeps the chart consistent with its title: without
# it, an export that contains datasets created after 2020 would add 2021+
# months to a chart labelled "en 2020".
start_date = '2020-1-1'
end_date = '2021-1-1'
created_at = datasets['created_at']
created_in_2020 = datasets[(created_at >= start_date) & (created_at < end_date)]
# NOTE: `d_year` is reused from the yearly cell above but holds monthly counts here.
d_year = created_in_2020.groupby(pd.Grouper(key="created_at", freq="M")).count()
fig = px.bar(
    x=d_year.index.strftime("%Y-%m"),
    y=d_year['id'],
    title='Nombre de jeux de données créés par mois en 2020',
)
fig.show()