In [3]:
import pandas as pd
import requests
from pathlib import Path
import plotly.express as px

In [15]:
# Catalog exports analysed in this notebook, one row per object type:
# (object id, stable resource URL, date columns to parse).
# The first date column of each row is the creation date; the aggregation
# cells below rely on that ordering via date_cols[0].
_CATALOG_EXPORTS = (
    (
        'datasets',
        'https://www.data.gouv.fr/fr/datasets/r/f868cca6-8da1-4369-a78d-47463f19a9a3',
        ('created_at', 'last_modified'),
    ),
    (
        'resources',
        'https://www.data.gouv.fr/fr/datasets/r/4babf5f2-6a9c-45b5-9144-ca5eae6a7a6d',
        ('created_at', 'modified'),
    ),
    (
        'reuses',
        'https://www.data.gouv.fr/fr/datasets/r/970aafa0-3778-4d8b-b9d1-de937525e379',
        ('created_at', 'last_modified'),
    ),
    (
        'discussions',
        'https://www.data.gouv.fr/fr/datasets/r/d77705e1-4ecd-461c-8c24-662d47c4c2f9',
        ('created', 'closed'),
    ),
    (
        'organisations',
        'https://www.data.gouv.fr/fr/datasets/r/b7bbfedc-2448-4135-a6c7-104548d396e7',
        ('created_at', 'last_modified'),
    ),
)

# Mutable per-export records; later cells attach 'filepath' and 'df' to them.
data = [
    {'id': object_id, 'url': url, 'date_cols': list(date_cols)}
    for object_id, url, date_cols in _CATALOG_EXPORTS
]

def download(timeout=30):
    """Download every export listed in ``data`` that is not cached yet.

    Each entry's stable URL is queried with a HEAD request; the redirect
    target's last path segment is used as the local file name under
    ``./data``. A file is fetched only when no file of that name exists
    locally. Every entry of ``data`` gets a ``'filepath'`` key pointing at
    its local file, whether or not it was downloaded during this call.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        True when at least one file was downloaded, False otherwise.

    Raises:
        requests.HTTPError: When a request answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    from urllib.parse import urljoin

    has_changed = False
    data_path = Path('./data')
    data_path.mkdir(exist_ok=True)
    for item in data:
        # Redirects are deliberately not followed: the Location header of the
        # stable URL is what reveals the current file name.
        r = requests.head(item['url'], allow_redirects=False, timeout=timeout)
        r.raise_for_status()
        # Fall back to the stable URL when there is no redirect, and resolve
        # a relative Location against it.
        location = urljoin(item['url'], r.headers.get('Location', ''))
        filename = location.split('/')[-1]
        filepath = data_path / filename
        if not filepath.exists():
            r = requests.get(location, timeout=timeout)
            # Never cache an error page: it would be mistaken for valid data
            # on every later run because only the file's existence is checked.
            r.raise_for_status()
            # Write to a temporary name first so an interrupted write cannot
            # leave a truncated file under the final name.
            partial = filepath.with_name(filepath.name + '.part')
            partial.write_bytes(r.content)
            partial.replace(filepath)
            has_changed = True
        item['filepath'] = filepath
    return has_changed

In [16]:
# Fetch any export not yet cached under ./data and record each local path in
# `data`; the displayed value is True when at least one file was downloaded.
download()

False

In [17]:
# Read each downloaded export (semicolon-separated CSV) into a DataFrame stored
# under the entry's 'df' key, parsing the entry's date columns as datetimes.
for datum in data:
    datum['df'] = pd.read_csv(
        datum['filepath'],
        sep=';',
        parse_dates=datum['date_cols'],
    )

In [27]:
# Count objects created per calendar year, one column per object type.
# The first entry of 'date_cols' is the creation date of each export.
yearly_counts = []
for datum in data:
    created = datum['date_cols'][0]
    yearly_counts.append(
        datum['df']
        .groupby(pd.Grouper(key=created, freq="Y"))['id']
        .count()
        .rename(datum['id'])
    )
# Align the series on the union of years. An inner merge would drop, for every
# object type, any year in which a single type has no rows; missing
# combinations are real zeros, so fill them as such.
df_year = pd.concat(yearly_counts, axis=1).sort_index().fillna(0).astype(int)
df_year

Unnamed: 0,datasets,resources,reuses,discussions,organisations
2014-12-31,1638,9801,182,177,341
2015-12-31,5629,2753,409,223,206
2016-12-31,6173,3787,240,241,240
2017-12-31,11086,29810,326,786,411
2018-12-31,5479,31434,290,1337,650
2019-12-31,6121,29682,343,2024,541
2020-12-31,4246,134851,498,2574,326


In [28]:
# Stacked bar chart of yearly creations: one bar per year, one colour per object type.
fig = px.bar(
    df_year,
    x=df_year.index.year,
    y=[entry['id'] for entry in data],
    title='Nombre d\'objets créés par an',
)
fig.show()

In [7]:
# Number of datasets last modified in each calendar year since start_date.
start_date = '2014-1-1'
# No earlier cell defines `datasets`, so take the frame from the `data` entry
# whose id is 'datasets'; this keeps the cell runnable on a fresh kernel.
datasets = next(entry['df'] for entry in data if entry['id'] == 'datasets')
d_mod = (
    datasets[datasets['last_modified'] >= start_date]
    .groupby(pd.Grouper(key="last_modified", freq="Y"))
    .count()
)
fig = px.bar(x=d_mod.index.year, y=d_mod['id'], title='Nombre de jeux de données modifiés par an')
fig.show()

In [40]:
from datetime import datetime
# Only objects created within the last 365 days are kept.
start_date = datetime.now() - pd.Timedelta(days=365)

# Count objects created per calendar month, one column per object type.
# The first entry of 'date_cols' is the creation date of each export.
monthly_counts = []
for datum in data:
    created = datum['date_cols'][0]
    frame = datum['df']
    monthly_counts.append(
        frame[frame[created] >= start_date]
        .groupby(pd.Grouper(key=created, freq="M"))['id']
        .count()
        .rename(datum['id'])
    )
# Align the series on the union of months. An inner merge would drop, for every
# object type, any month in which a single type has no rows (and would empty
# the whole table if one type had no creation in the window); fill those
# combinations with zero instead.
df_month = pd.concat(monthly_counts, axis=1).sort_index().fillna(0).astype(int)

fig = px.bar(df_month, x=df_month.index.strftime("%Y-%m"), y=[d['id'] for d in data], title='Nombre d\'objets créés dans les 12 derniers mois')
fig.show()