In [63]:
# Imports

import requests
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import calendar
from jours_feries_france import JoursFeries

### Collect Road data

In [64]:
def get_road_data_by_date(year, month, day, libelle, libelle_nd_amont, libelle_nd_aval):

    # API Url
    api_url = "https://opendata.paris.fr/api/explore/v2.1/catalog/datasets/comptages-routiers-permanents/records"

    # Request parameters
    params = {
        "limit": 24,
        "order_by": "t_1h",
        "timezone": "CET",
        "refine": [
            f"t_1h:{year}/{month}/{day}",
            f"libelle_nd_amont:{libelle_nd_amont}",
            f"libelle_nd_aval:{libelle_nd_aval}",
            f"libelle:{libelle}"
        ]
    }

    # Get response
    response = requests.get(api_url, params=params)

    # Fetch data
    if response.status_code == 200:
        records = response.json().get("results", [])
        if records:
            df = pd.DataFrame(columns=["year", "month", "day", "hour", "flow rate", "occupancy rate", "trafic state"])
            for index, record in enumerate(records):
                t_1h = record["t_1h"]
                q = record["q"]
                k = record["k"]
                trafic_state = record["etat_trafic"]
                year = t_1h[:4]
                month = t_1h[5:7]
                day = t_1h[8:10]
                hour = t_1h[11:13]
                
                df.loc[index] = [year, month, day, hour, q, k, trafic_state]
        else:
            print(f"No data found ({year}/{month}/{day}).")
            df = pd.DataFrame(columns=["year", "month", "day", "hour", "flow rate", "occupancy rate", "trafic state"])
    else:
        print(f"API Error : {response.status_code}")
    
    empty_df = pd.DataFrame({"year": year, "month": month, "day": day, "hour": [str(i).zfill(2) for i in range(24)]})

    filled_df = pd.merge(df, empty_df, on=['year', 'month', 'day', 'hour'], how="outer")
    return filled_df

In [65]:
print(get_road_data_by_date("2024", "11", "15", "AV_Champs_Elysees", "Av_Champs_Elysees-Washington", "Av_Champs_Elysees-Berri"))
#print(get_road_data_by_date("2023", "12", "06", "Convention", "Convention-Blomet", "Lecourbe-Convention"))
#print(get_road_data_by_date("2023", "12", "06", "St_Antoine", "Bastille-St_Antoine", "St_Antoine-Jacques_Coeur"))

# q: h flow rate (number of vehicle per hour)
# k: occupancy rate (time spent by a vehicle in one hour, in %, 100% = 1 hour)

    year month day hour  flow rate  occupancy rate trafic state
0   2024    11  15   00        NaN             NaN          NaN
1   2024    11  15   01        NaN             NaN          NaN
2   2024    11  15   02        NaN             NaN          NaN
3   2024    11  15   03        NaN             NaN          NaN
4   2024    11  15   04        NaN             NaN          NaN
5   2024    11  15   05        NaN             NaN          NaN
6   2024    11  15   06        NaN             NaN          NaN
7   2024    11  15   07        NaN             NaN          NaN
8   2024    11  15   08        NaN             NaN          NaN
9   2024    11  15   09        NaN             NaN          NaN
10  2024    11  15   10      254.0         4.09445       Fluide
11  2024    11  15   11        NaN             NaN          NaN
12  2024    11  15   12        NaN             NaN          NaN
13  2024    11  15   13        NaN             NaN          NaN
14  2024    11  15   14        NaN      

In [66]:
def get_road_data_per_month(year, month, libelle, libelle_nd_amont, libelle_nd_aval):

    number_of_days = calendar.monthrange(int(year), int(month))[1]

    dfs = []

    for index in range(number_of_days):
        day = str(index + 1).zfill(2)
        dfs.append(get_road_data_by_date(year, month, day, libelle, libelle_nd_amont, libelle_nd_aval))
    
    df = pd.concat(dfs, ignore_index=True)
    return df

In [67]:
print(get_road_data_per_month("2023", "11", "AV_Champs_Elysees", "Av_Champs_Elysees-Washington", "Av_Champs_Elysees-Berri"))
#print(get_road_data_per_month("2024", "11", "Convention", "Convention-Blomet", "Lecourbe-Convention"))
#print(get_road_data_per_month("2024", "11", "St_Antoine", "Bastille-St_Antoine", "St_Antoine-Jacques_Coeur"))

  df.loc[index] = [year, month, day, hour, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour, q, k, trafic

     year month day hour  flow rate  occupancy rate trafic state
0    2023    11  01   00        NaN             NaN          NaN
1    2023    11  01   01        NaN             NaN          NaN
2    2023    11  01   02        NaN             NaN          NaN
3    2023    11  01   03        NaN             NaN          NaN
4    2023    11  01   04      704.0        11.40389       Fluide
..    ...   ...  ..  ...        ...             ...          ...
715  2023    11  30   19     1457.0        28.16000   Pré-saturé
716  2023    11  30   20     1433.0        30.87000       Saturé
717  2023    11  30   21     1253.0        24.38111   Pré-saturé
718  2023    11  30   22     1069.0        20.40723   Pré-saturé
719  2023    11  30   23      950.0        16.66334   Pré-saturé

[720 rows x 7 columns]


### Collect Weather data

In [23]:
def fetch_weather_data(start, end=None):
    """
    Uses Weather API to collect weather data for a certain day.
    """
    
    params = {
        'date': start, 
        'q': "paris,france",
        'format': "json", 
        'tp': "1", 
        'key': "694c007d0c1b4641a3f155842240312"
        }
    
    if end:
        params['enddate'] = end

    request = requests.get(url="https://api.worldweatheronline.com/premium/v1/past-weather.ashx", params=params)
    json_data = request.json()

    if json_data['data']['weather']:
        return json_data['data']['weather']
    else:
        print("Error when fetching weather data")

In [24]:
def generate_month_ranges(start_year, start_month):
    """
    Generates a list of tuples starting from a certain year. 
    Each tuple contains the first and last days of the month.
    """
    
    current_date = datetime.now()
    start_date = datetime(start_year, start_month, 1)
    month_ranges = []

    while start_date < current_date:
        end_date = (start_date + timedelta(days=32)).replace(day=1) - timedelta(days=1)
        if end_date > current_date:
            end_date = current_date
        month_ranges.append((start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d")))
        start_date = end_date + timedelta(days=1)

    return month_ranges

In [25]:
def fetch_all_weather_data(start_year, start_month):
    """
    Collects all weather data starting from a certain year, using Weather API.
    """
    
    month_ranges = generate_month_ranges(start_year, start_month)
    all_weather_data = []

    for start, end in tqdm(month_ranges, desc="Fetching weather data", unit="month"):
        data = fetch_weather_data(start=start, end=end)
        if data:
            all_weather_data.extend(data)

    return all_weather_data

In [26]:
data = fetch_all_weather_data(start_year=2023, start_month=1)

Fetching weather data: 100%|██████████| 24/24 [00:06<00:00,  3.98month/s]


In [49]:
def collect_daily_weather_data(data):
    """
    Generates a dataframe with daily weather data.
    """

    general_data = []  

    for i in range(len(data)):
        general_data.append({
            'year': data[i]['date'][:4], 
            'month': data[i]['date'][5:7],
            'day': data[i]['date'][8:10],
            'maxtempC': data[i]['maxtempC'],
            'mintempC': data[i]['mintempC'],
            'avgtempC': data[i]['avgtempC'],
            'totalSnow_cm': data[i]['totalSnow_cm'],
            'sunHour': data[i]['sunHour'],
            'uvIndex': data[i]['uvIndex']
        })

    return pd.DataFrame(general_data).astype(float)


daily_weather_df = collect_daily_weather_data(data)

In [50]:
def collect_hourly_weather_data(data):
    """
    Generates a dataframe with hourly weather data.
    """

    hourly_data = []  

    for i in range(len(data)):
        for j in range(24):
            hourly_data.append({
                'year': data[i]['date'][:4], 
                'month': data[i]['date'][5:7],
                'day': data[i]['date'][8:10],
                'hour': data[i]['hourly'][j]['time'] , 
                'tempC': data[i]['hourly'][j]['tempC'],
                'windspeedKmph': data[i]['hourly'][j]['windspeedKmph'],
                'winddirDegree': data[i]['hourly'][j]['winddirDegree'],
                'precipMM': data[i]['hourly'][j]['precipMM'],
                'humidity': data[i]['hourly'][j]['humidity'],
                'visibility': data[i]['hourly'][j]['visibility'],
                'pressure': data[i]['hourly'][j]['pressure'],
                'cloudcover': data[i]['hourly'][j]['cloudcover'],
                'HeatIndexC': data[i]['hourly'][j]['HeatIndexC'],
                'DewPointC': data[i]['hourly'][j]['DewPointC'],
                'WindChillC': data[i]['hourly'][j]['WindChillC'],
                'WindGustKmph': data[i]['hourly'][j]['WindGustKmph'],
                'FeelsLikeC': data[i]['hourly'][j]['FeelsLikeC'],
                'uvIndex': data[i]['hourly'][j]['uvIndex'],
            })

    return pd.DataFrame(hourly_data).astype(float)

hourly_weather_df = collect_hourly_weather_data(data)

In [51]:
hourly_weather_df = hourly_weather_df.merge(
    daily_weather_df[['year', 'month', 'day', 'totalSnow_cm']],
    on=['year', 'month', 'day'],
    how='left'
)
hourly_weather_df

Unnamed: 0,year,month,day,hour,tempC,windspeedKmph,winddirDegree,precipMM,humidity,visibility,pressure,cloudcover,HeatIndexC,DewPointC,WindChillC,WindGustKmph,FeelsLikeC,uvIndex,totalSnow_cm
0,2023.0,1.0,1.0,0.0,14.0,27.0,205.0,0.0,61.0,10.0,1015.0,23.0,14.0,6.0,11.0,50.0,11.0,1.0,0.0
1,2023.0,1.0,1.0,100.0,14.0,25.0,204.0,0.0,62.0,10.0,1016.0,31.0,14.0,6.0,11.0,47.0,11.0,1.0,0.0
2,2023.0,1.0,1.0,200.0,13.0,23.0,204.0,0.0,63.0,10.0,1016.0,40.0,13.0,7.0,12.0,44.0,12.0,1.0,0.0
3,2023.0,1.0,1.0,300.0,13.0,22.0,204.0,0.0,64.0,10.0,1016.0,48.0,13.0,7.0,11.0,42.0,11.0,1.0,0.0
4,2023.0,1.0,1.0,400.0,13.0,21.0,205.0,0.0,65.0,10.0,1016.0,41.0,13.0,7.0,11.0,40.0,11.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16891,2024.0,12.0,4.0,1900.0,7.0,10.0,210.0,0.0,72.0,10.0,1028.0,41.0,7.0,2.0,5.0,16.0,5.0,1.0,0.0
16892,2024.0,12.0,4.0,2000.0,7.0,11.0,214.0,0.0,72.0,10.0,1028.0,39.0,7.0,2.0,5.0,18.0,5.0,1.0,0.0
16893,2024.0,12.0,4.0,2100.0,7.0,11.0,216.0,0.0,74.0,10.0,1028.0,27.0,7.0,2.0,4.0,18.0,4.0,1.0,0.0
16894,2024.0,12.0,4.0,2200.0,7.0,12.0,223.0,0.0,74.0,10.0,1028.0,63.0,7.0,2.0,4.0,19.0,4.0,1.0,0.0


### Collect School Holidays + Public Holidays

In [53]:
def get_school_holidays(year):

    # API Url
    api_url = "https://data.education.gouv.fr/api/explore/v2.1/catalog/datasets/fr-en-calendrier-scolaire/records"

    # Request parameters
    params = {
        "limit": -1,
        #"order_by": "t_1h",
        "timezone": "CET",
        "where": f'location="Paris" AND (annee_scolaire = "{str(year-1)}-{str(year)}" OR annee_scolaire = "{str(year)}-{str(year+1)}")'
    }

    # Get response
    response = requests.get(api_url, params=params)

    # Fetch data
    if response.status_code == 200:
        records = response.json().get("results", [])
        if records:
            start = []
            end = []
            for index, record in enumerate(records):
                start_date = record["start_date"]
                syear = int(start_date[:4])
                smonth = int(start_date[5:7])
                sday = int(start_date[8:10])
                end_date = record["end_date"]
                eyear = int(end_date[:4])
                emonth = int(end_date[5:7])
                eday = int(end_date[8:10])
                
                start.append(datetime(syear, smonth, sday))
                end.append(datetime(eyear, emonth, eday))
        else:
            print("No data found.")
    else:
        print(f"API Error : {response.status_code}")

    return start, end

In [54]:
def get_public_holidays(year):
    dates = JoursFeries.for_year(year)
    return list(dates.values())

In [55]:
print(get_school_holidays(2024))
print(get_public_holidays(2024))

([datetime.datetime(2024, 12, 21, 0, 0), datetime.datetime(2025, 4, 12, 0, 0), datetime.datetime(2024, 5, 9, 0, 0), datetime.datetime(2025, 5, 29, 0, 0), datetime.datetime(2025, 7, 5, 0, 0), datetime.datetime(2023, 12, 23, 0, 0), datetime.datetime(2024, 2, 10, 0, 0), datetime.datetime(2024, 4, 6, 0, 0), datetime.datetime(2023, 10, 21, 0, 0), datetime.datetime(2024, 7, 6, 0, 0), datetime.datetime(2024, 7, 6, 0, 0), datetime.datetime(2024, 10, 19, 0, 0), datetime.datetime(2025, 2, 15, 0, 0), datetime.datetime(2025, 7, 5, 0, 0)], [datetime.datetime(2025, 1, 6, 0, 0), datetime.datetime(2025, 4, 28, 0, 0), datetime.datetime(2024, 5, 13, 0, 0), datetime.datetime(2025, 6, 2, 0, 0), datetime.datetime(2025, 8, 29, 0, 0), datetime.datetime(2024, 1, 8, 0, 0), datetime.datetime(2024, 2, 26, 0, 0), datetime.datetime(2024, 4, 22, 0, 0), datetime.datetime(2023, 11, 6, 0, 0), datetime.datetime(2024, 9, 2, 0, 0), datetime.datetime(2024, 8, 30, 0, 0), datetime.datetime(2024, 11, 4, 0, 0), datetime.datet

In [56]:
def get_holidays_df(target_year):
    dates = pd.date_range(start=f"{target_year}-01-01", end=f"{target_year}-12-31", freq="D")

    df = pd.DataFrame({
        "year": dates.year,
        "month": dates.month,
        "day": dates.day,
        "school_holiday": [0] * len(dates),  # Première colonne vide
        "public_holiday": [0] * len(dates)   # Deuxième colonne vide
    })

    start_dates, end_dates = get_school_holidays(target_year)

    for index in range(len(start_dates)):
        sday = start_dates[index]
        eday = end_dates[index]

        sh_days = pd.date_range(start=sday, end=eday, freq="D")
        sh_days_list = sh_days.tolist()

        for sh_day in sh_days_list:
            year, month, day = sh_day.year, sh_day.month, sh_day.day
            if year != target_year:
                continue
            else:
                mask = (df["year"] == year) & (df["month"] == month) & (df["day"] == day)
                df.loc[mask, "school_holiday"] = 1
        
        ph_days = get_public_holidays(target_year)
        for ph_day in ph_days:
            year, month, day = ph_day.year, ph_day.month, ph_day.day
            mask = (df["year"] == year) & (df["month"] == month) & (df["day"] == day)
            df.loc[mask, "public_holiday"] = 1

    return df.astype(float)

In [57]:
holidays_df = pd.concat([get_holidays_df(2023), get_holidays_df(2024)], ignore_index=True)
holidays_df

Unnamed: 0,year,month,day,school_holiday,public_holiday
0,2023.0,1.0,1.0,1.0,1.0
1,2023.0,1.0,2.0,1.0,0.0
2,2023.0,1.0,3.0,1.0,0.0
3,2023.0,1.0,4.0,0.0,0.0
4,2023.0,1.0,5.0,0.0,0.0
...,...,...,...,...,...
726,2024.0,12.0,27.0,1.0,0.0
727,2024.0,12.0,28.0,1.0,0.0
728,2024.0,12.0,29.0,1.0,0.0
729,2024.0,12.0,30.0,1.0,0.0


### All data in one dataframe

In [None]:
complete_df = hourly_weather_df.merge(
    holidays_df,
    on=['year', 'month', 'day'],
    how='left'
)
complete_df

Unnamed: 0,year,month,day,hour,tempC,windspeedKmph,winddirDegree,precipMM,humidity,visibility,...,cloudcover,HeatIndexC,DewPointC,WindChillC,WindGustKmph,FeelsLikeC,uvIndex,totalSnow_cm,school_holiday,public_holiday
0,2023.0,1.0,1.0,0.0,14.0,27.0,205.0,0.0,61.0,10.0,...,23.0,14.0,6.0,11.0,50.0,11.0,1.0,0.0,1.0,1.0
1,2023.0,1.0,1.0,100.0,14.0,25.0,204.0,0.0,62.0,10.0,...,31.0,14.0,6.0,11.0,47.0,11.0,1.0,0.0,1.0,1.0
2,2023.0,1.0,1.0,200.0,13.0,23.0,204.0,0.0,63.0,10.0,...,40.0,13.0,7.0,12.0,44.0,12.0,1.0,0.0,1.0,1.0
3,2023.0,1.0,1.0,300.0,13.0,22.0,204.0,0.0,64.0,10.0,...,48.0,13.0,7.0,11.0,42.0,11.0,1.0,0.0,1.0,1.0
4,2023.0,1.0,1.0,400.0,13.0,21.0,205.0,0.0,65.0,10.0,...,41.0,13.0,7.0,11.0,40.0,11.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16891,2024.0,12.0,4.0,1900.0,7.0,10.0,210.0,0.0,72.0,10.0,...,41.0,7.0,2.0,5.0,16.0,5.0,1.0,0.0,0.0,0.0
16892,2024.0,12.0,4.0,2000.0,7.0,11.0,214.0,0.0,72.0,10.0,...,39.0,7.0,2.0,5.0,18.0,5.0,1.0,0.0,0.0,0.0
16893,2024.0,12.0,4.0,2100.0,7.0,11.0,216.0,0.0,74.0,10.0,...,27.0,7.0,2.0,4.0,18.0,4.0,1.0,0.0,0.0,0.0
16894,2024.0,12.0,4.0,2200.0,7.0,12.0,223.0,0.0,74.0,10.0,...,63.0,7.0,2.0,4.0,19.0,4.0,1.0,0.0,0.0,0.0
