In [30]:
# Imports

import requests
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import calendar
from jours_feries_france import JoursFeries

### Collect Road data

In [56]:
def get_road_data_by_date(year, month, day, libelle, libelle_nd_amont, libelle_nd_aval):

    # API Url
    api_url = "https://opendata.paris.fr/api/explore/v2.1/catalog/datasets/comptages-routiers-permanents/records"

    # Request parameters
    params = {
        "limit": 24,
        "order_by": "t_1h",
        "timezone": "CET",
        "refine": [
            f"t_1h:{year}/{month}/{day}",
            f"libelle_nd_amont:{libelle_nd_amont}",
            f"libelle_nd_aval:{libelle_nd_aval}",
            f"libelle:{libelle}"
        ]
    }

    # Get response
    response = requests.get(api_url, params=params)

    # Fetch data
    if response.status_code == 200:
        records = response.json().get("results", [])
        if records:
            df = pd.DataFrame(columns=["year", "month", "day", "hour", "libelle", "flow rate", "occupancy rate", "trafic state"])
            for index, record in enumerate(records):
                t_1h = record["t_1h"]
                rue = record['libelle']
                q = record["q"]
                k = record["k"]
                trafic_state = record["etat_trafic"]
                year = t_1h[:4]
                month = t_1h[5:7]
                day = t_1h[8:10]
                hour = t_1h[11:13]
                
                df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
        else:
            print(f"No data found ({year}/{month}/{day}).")
            df = pd.DataFrame(columns=["year", "month", "day", "hour", "libelle", "flow rate", "occupancy rate", "trafic state"])
    else:
        print(f"API Error : {response.status_code}")
    
    empty_df = pd.DataFrame({"year": year, "month": month, "day": day, "hour": [str(i).zfill(2) for i in range(24)]})

    filled_df = pd.merge(df, empty_df, on=['year', 'month', 'day', 'hour'], how="outer")
    return filled_df

In [81]:
print(get_road_data_by_date("2023", "11", "15", "AV_Champs_Elysees", "Av_Champs_Elysees-Washington", "Av_Champs_Elysees-Berri"))
#print(get_road_data_by_date("2023", "12", "06", "Convention", "Convention-Blomet", "Lecourbe-Convention"))
#print(get_road_data_by_date("2023", "12", "06", "St_Antoine", "Bastille-St_Antoine", "St_Antoine-Jacques_Coeur"))

# q: h flow rate (number of vehicle per hour)
# k: occupancy rate (time spent by a vehicle in one hour, in %, 100% = 1 hour)

    year month day hour            libelle  flow rate  occupancy rate  \
0   2023    11  15   00  AV_Champs_Elysees      920.0        22.77945   
1   2023    11  15   01  AV_Champs_Elysees      852.0        20.30778   
2   2023    11  15   02  AV_Champs_Elysees      905.0        42.79500   
3   2023    11  15   03  AV_Champs_Elysees      747.0        28.79945   
4   2023    11  15   04  AV_Champs_Elysees      574.0        37.76333   
5   2023    11  15   05  AV_Champs_Elysees      647.0        34.72278   
6   2023    11  15   06  AV_Champs_Elysees      669.0        23.08222   
7   2023    11  15   07  AV_Champs_Elysees     1122.0        20.54611   
8   2023    11  15   08  AV_Champs_Elysees     1501.0        32.66723   
9   2023    11  15   09  AV_Champs_Elysees     1659.0        45.52444   
10  2023    11  15   10  AV_Champs_Elysees     1588.0        34.66723   
11  2023    11  15   11  AV_Champs_Elysees     1568.0        37.51278   
12  2023    11  15   12  AV_Champs_Elysees     1383

In [58]:
def get_road_data_per_month(year, month, libelle, libelle_nd_amont, libelle_nd_aval):

    number_of_days = calendar.monthrange(int(year), int(month))[1]

    dfs = []

    for index in range(number_of_days):
        day = str(index + 1).zfill(2)
        dfs.append(get_road_data_by_date(year, month, day, libelle, libelle_nd_amont, libelle_nd_aval))
    
    df = pd.concat(dfs, ignore_index=True)
    return df

In [59]:
df_av_champs_2023 = pd.DataFrame()
df_convention_2023 = pd.DataFrame()
df_st_antoine_2023 = pd.DataFrame()

for i in range(1,13) :
    df_1 = get_road_data_per_month("2023", str(i), "AV_Champs_Elysees", "Av_Champs_Elysees-Washington", "Av_Champs_Elysees-Berri")
    df_2 = get_road_data_per_month("2023", str(i), "Convention", "Convention-Blomet", "Lecourbe-Convention")
    df_3 = get_road_data_per_month("2023", str(i), "St_Antoine", "Bastille-St_Antoine", "St_Antoine-Jacques_Coeur")
    df_av_champs_2023 = pd.concat([df_av_champs_2023, df_1])
    df_convention_2023 = pd.concat([df_convention_2023, df_2])
    df_st_antoine_2023 = pd.concat([df_st_antoine_2023, df_3])

No data found (2023/1/01).
No data found (2023/1/02).
No data found (2023/1/03).
No data found (2023/1/04).
No data found (2023/1/05).
No data found (2023/1/06).
No data found (2023/1/07).
No data found (2023/1/08).
No data found (2023/1/09).
No data found (2023/1/10).
No data found (2023/1/11).
No data found (2023/1/12).
No data found (2023/1/13).
No data found (2023/1/14).
No data found (2023/1/15).
No data found (2023/1/16).
No data found (2023/1/17).
No data found (2023/1/18).
No data found (2023/1/19).
No data found (2023/1/20).
No data found (2023/1/21).
No data found (2023/1/22).
No data found (2023/1/23).
No data found (2023/1/24).
No data found (2023/1/25).
No data found (2023/1/26).
No data found (2023/1/27).
No data found (2023/1/28).
No data found (2023/1/29).
No data found (2023/1/30).
No data found (2023/1/31).
No data found (2023/1/01).
No data found (2023/1/02).
No data found (2023/1/03).
No data found (2023/1/04).
No data found (2023/1/05).
No data found (2023/1/06).
N

  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_st

In [60]:
df_av_champs_2023.shape

(8760, 8)

In [61]:
df_convention_2023.shape

(8760, 8)

In [62]:
df_st_antoine_2023.shape

(8760, 8)

In [63]:
df_complete_2023_trafic = pd.concat([df_av_champs_2023, df_convention_2023, df_st_antoine_2023])

In [64]:
df_complete_2023_trafic.shape

(26280, 8)

In [65]:
df_av_champs_2024 = pd.DataFrame()
df_convention_2024 = pd.DataFrame()
df_st_antoine_2024 = pd.DataFrame()

for i in range(1,13) :
    df_1 = get_road_data_per_month("2024", str(i), "AV_Champs_Elysees", "Av_Champs_Elysees-Washington", "Av_Champs_Elysees-Berri")
    df_2 = get_road_data_per_month("2024", str(i), "Convention", "Convention-Blomet", "Lecourbe-Convention")
    df_3 = get_road_data_per_month("2024", str(i), "St_Antoine", "Bastille-St_Antoine", "St_Antoine-Jacques_Coeur")
    df_av_champs_2024 = pd.concat([df_av_champs_2024, df_1])
    df_convention_2024 = pd.concat([df_convention_2024, df_2])
    df_st_antoine_2024 = pd.concat([df_st_antoine_2024, df_3])

  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_st

No data found (2024/5/05).
No data found (2024/5/06).
No data found (2024/5/09).


  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_st

No data found (2024/5/05).
No data found (2024/5/06).
No data found (2024/5/07).
No data found (2024/5/08).
No data found (2024/5/09).


  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df = pd.concat(dfs, ignore_index=True)


No data found (2024/5/05).
No data found (2024/5/06).
No data found (2024/5/09).


  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df = pd.concat(dfs, ignore_index=True)
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [ye

No data found (2024/6/14).
No data found (2024/6/15).
No data found (2024/6/16).
No data found (2024/6/17).
No data found (2024/6/18).
No data found (2024/6/19).
No data found (2024/6/20).
No data found (2024/6/21).
No data found (2024/6/22).
No data found (2024/6/23).


  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]


No data found (2024/6/29).
No data found (2024/6/30).


  df = pd.concat(dfs, ignore_index=True)
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]


No data found (2024/6/14).
No data found (2024/6/15).
No data found (2024/6/16).
No data found (2024/6/17).
No data found (2024/6/18).
No data found (2024/6/19).
No data found (2024/6/20).
No data found (2024/6/21).
No data found (2024/6/22).
No data found (2024/6/29).
No data found (2024/6/30).


  df = pd.concat(dfs, ignore_index=True)
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]


No data found (2024/6/14).
No data found (2024/6/15).
No data found (2024/6/16).
No data found (2024/6/17).
No data found (2024/6/18).
No data found (2024/6/19).
No data found (2024/6/20).
No data found (2024/6/21).
No data found (2024/6/22).
No data found (2024/6/23).
No data found (2024/6/29).
No data found (2024/6/30).


  df = pd.concat(dfs, ignore_index=True)


No data found (2024/7/01).
No data found (2024/7/02).
No data found (2024/7/03).
No data found (2024/7/04).
No data found (2024/7/05).
No data found (2024/7/06).
No data found (2024/7/07).
No data found (2024/7/08).
No data found (2024/7/09).


  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df = pd.concat(dfs, ignore_index=True)


No data found (2024/7/01).
No data found (2024/7/02).
No data found (2024/7/03).
No data found (2024/7/04).
No data found (2024/7/05).
No data found (2024/7/06).
No data found (2024/7/07).
No data found (2024/7/08).
No data found (2024/7/09).


  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df = pd.concat(dfs, ignore_index=True)


No data found (2024/7/01).
No data found (2024/7/02).
No data found (2024/7/03).
No data found (2024/7/04).
No data found (2024/7/05).
No data found (2024/7/06).
No data found (2024/7/07).
No data found (2024/7/08).
No data found (2024/7/09).


  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df = pd.concat(dfs, ignore_index=True)
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [ye

No data found (2024/11/13).
No data found (2024/11/14).
No data found (2024/11/16).
No data found (2024/11/17).
No data found (2024/11/18).
No data found (2024/11/19).
No data found (2024/11/20).
No data found (2024/11/21).
No data found (2024/11/23).


  df = pd.concat(dfs, ignore_index=True)
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]


No data found (2024/11/13).
No data found (2024/11/14).
No data found (2024/11/16).
No data found (2024/11/17).
No data found (2024/11/18).
No data found (2024/11/19).
No data found (2024/11/20).
No data found (2024/11/21).
No data found (2024/11/23).


  df = pd.concat(dfs, ignore_index=True)
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]


No data found (2024/11/13).
No data found (2024/11/14).
No data found (2024/11/16).
No data found (2024/11/17).
No data found (2024/11/18).
No data found (2024/11/19).
No data found (2024/11/20).
No data found (2024/11/21).
No data found (2024/11/23).


  df = pd.concat(dfs, ignore_index=True)


No data found (2024/12/05).
No data found (2024/12/06).
No data found (2024/12/07).
No data found (2024/12/08).
No data found (2024/12/09).
No data found (2024/12/10).
No data found (2024/12/11).
No data found (2024/12/12).
No data found (2024/12/13).
No data found (2024/12/14).
No data found (2024/12/15).
No data found (2024/12/16).
No data found (2024/12/17).
No data found (2024/12/18).
No data found (2024/12/19).
No data found (2024/12/20).
No data found (2024/12/21).
No data found (2024/12/22).
No data found (2024/12/23).
No data found (2024/12/24).
No data found (2024/12/25).
No data found (2024/12/26).
No data found (2024/12/27).
No data found (2024/12/28).
No data found (2024/12/29).
No data found (2024/12/30).
No data found (2024/12/31).


  df = pd.concat(dfs, ignore_index=True)
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]
  df.loc[index] = [year, month, day, hour,rue, q, k, trafic_state]


No data found (2024/12/05).
No data found (2024/12/06).
No data found (2024/12/07).
No data found (2024/12/08).
No data found (2024/12/09).
No data found (2024/12/10).
No data found (2024/12/11).
No data found (2024/12/12).
No data found (2024/12/13).
No data found (2024/12/14).
No data found (2024/12/15).
No data found (2024/12/16).
No data found (2024/12/17).
No data found (2024/12/18).
No data found (2024/12/19).
No data found (2024/12/20).
No data found (2024/12/21).
No data found (2024/12/22).
No data found (2024/12/23).
No data found (2024/12/24).
No data found (2024/12/25).
No data found (2024/12/26).
No data found (2024/12/27).
No data found (2024/12/28).
No data found (2024/12/29).
No data found (2024/12/30).
No data found (2024/12/31).


  df = pd.concat(dfs, ignore_index=True)


No data found (2024/12/05).
No data found (2024/12/06).
No data found (2024/12/07).
No data found (2024/12/08).
No data found (2024/12/09).
No data found (2024/12/10).
No data found (2024/12/11).
No data found (2024/12/12).
No data found (2024/12/13).
No data found (2024/12/14).
No data found (2024/12/15).
No data found (2024/12/16).
No data found (2024/12/17).
No data found (2024/12/18).
No data found (2024/12/19).
No data found (2024/12/20).
No data found (2024/12/21).
No data found (2024/12/22).
No data found (2024/12/23).
No data found (2024/12/24).
No data found (2024/12/25).
No data found (2024/12/26).
No data found (2024/12/27).
No data found (2024/12/28).
No data found (2024/12/29).
No data found (2024/12/30).
No data found (2024/12/31).


  df = pd.concat(dfs, ignore_index=True)


In [66]:
df_complete_2024_trafic = pd.concat([df_av_champs_2024, df_convention_2024, df_st_antoine_2024])

In [67]:
df_complete_2024_trafic.shape

(26352, 8)

In [68]:
df_complete_2023_trafic.shape

(26280, 8)

In [74]:
df_complete_trafic = pd.concat([df_complete_2023_trafic, df_complete_2024_trafic])
df_complete_trafic.shape

(52632, 8)

### Collect Weather data

In [22]:
def fetch_weather_data(start, end=None):
    """
    Uses Weather API to collect weather data for a certain day.
    """
    
    params = {
        'date': start, 
        'q': "paris,france",
        'format': "json", 
        'tp': "1", 
        'key': "694c007d0c1b4641a3f155842240312"
        }
    
    if end:
        params['enddate'] = end

    request = requests.get(url="https://api.worldweatheronline.com/premium/v1/past-weather.ashx", params=params)
    json_data = request.json()

    if json_data['data']['weather']:
        return json_data['data']['weather']
    else:
        print("Error when fetching weather data")

In [23]:
def generate_month_ranges(start_year, start_month):
    """
    Generates a list of tuples starting from a certain year. 
    Each tuple contains the first and last days of the month.
    """
    
    current_date = datetime.now()
    start_date = datetime(start_year, start_month, 1)
    month_ranges = []

    while start_date < current_date:
        end_date = (start_date + timedelta(days=32)).replace(day=1) - timedelta(days=1)
        if end_date > current_date:
            end_date = current_date
        month_ranges.append((start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d")))
        start_date = end_date + timedelta(days=1)

    return month_ranges

In [24]:
def fetch_all_weather_data(start_year, start_month):
    """
    Collects all weather data starting from a certain year, using Weather API.
    """
    
    month_ranges = generate_month_ranges(start_year, start_month)
    all_weather_data = []

    for start, end in tqdm(month_ranges, desc="Fetching weather data", unit="month"):
        data = fetch_weather_data(start=start, end=end)
        if data:
            all_weather_data.extend(data)

    return all_weather_data

In [25]:
data = fetch_all_weather_data(start_year=2023, start_month=1)

Fetching weather data:   0%|          | 0/24 [00:00<?, ?month/s]

Fetching weather data: 100%|██████████| 24/24 [00:11<00:00,  2.11month/s]


In [26]:
def collect_daily_weather_data(data):
    """
    Generates a dataframe with daily weather data.
    """

    general_data = []  

    for i in range(len(data)):
        general_data.append({
            'year': data[i]['date'][:4], 
            'month': data[i]['date'][5:7],
            'day': data[i]['date'][8:10],
            'maxtempC': data[i]['maxtempC'],
            'mintempC': data[i]['mintempC'],
            'avgtempC': data[i]['avgtempC'],
            'totalSnow_cm': data[i]['totalSnow_cm'],
            'sunHour': data[i]['sunHour'],
            'uvIndex': data[i]['uvIndex']
        })

    return pd.DataFrame(general_data).astype(float)


daily_weather_df = collect_daily_weather_data(data)

In [27]:
def collect_hourly_weather_data(data):
    """
    Generates a dataframe with hourly weather data.
    """

    hourly_data = []  

    for i in range(len(data)):
        for j in range(24):
            hourly_data.append({
                'year': data[i]['date'][:4], 
                'month': data[i]['date'][5:7],
                'day': data[i]['date'][8:10],
                'hour': data[i]['hourly'][j]['time'] , 
                'tempC': data[i]['hourly'][j]['tempC'],
                'windspeedKmph': data[i]['hourly'][j]['windspeedKmph'],
                'winddirDegree': data[i]['hourly'][j]['winddirDegree'],
                'precipMM': data[i]['hourly'][j]['precipMM'],
                'humidity': data[i]['hourly'][j]['humidity'],
                'visibility': data[i]['hourly'][j]['visibility'],
                'pressure': data[i]['hourly'][j]['pressure'],
                'cloudcover': data[i]['hourly'][j]['cloudcover'],
                'HeatIndexC': data[i]['hourly'][j]['HeatIndexC'],
                'DewPointC': data[i]['hourly'][j]['DewPointC'],
                'WindChillC': data[i]['hourly'][j]['WindChillC'],
                'WindGustKmph': data[i]['hourly'][j]['WindGustKmph'],
                'FeelsLikeC': data[i]['hourly'][j]['FeelsLikeC'],
                'uvIndex': data[i]['hourly'][j]['uvIndex'],
            })

    return pd.DataFrame(hourly_data).astype(float)

hourly_weather_df = collect_hourly_weather_data(data)

In [28]:
hourly_weather_df = hourly_weather_df.merge(
    daily_weather_df[['year', 'month', 'day', 'totalSnow_cm']],
    on=['year', 'month', 'day'],
    how='left'
)
hourly_weather_df

Unnamed: 0,year,month,day,hour,tempC,windspeedKmph,winddirDegree,precipMM,humidity,visibility,pressure,cloudcover,HeatIndexC,DewPointC,WindChillC,WindGustKmph,FeelsLikeC,uvIndex,totalSnow_cm
0,2023.0,1.0,1.0,0.0,14.0,27.0,205.0,0.0,61.0,10.0,1015.0,23.0,14.0,6.0,11.0,50.0,11.0,1.0,0.0
1,2023.0,1.0,1.0,100.0,14.0,25.0,204.0,0.0,62.0,10.0,1016.0,31.0,14.0,6.0,11.0,47.0,11.0,1.0,0.0
2,2023.0,1.0,1.0,200.0,13.0,23.0,204.0,0.0,63.0,10.0,1016.0,40.0,13.0,7.0,12.0,44.0,12.0,1.0,0.0
3,2023.0,1.0,1.0,300.0,13.0,22.0,204.0,0.0,64.0,10.0,1016.0,48.0,13.0,7.0,11.0,42.0,11.0,1.0,0.0
4,2023.0,1.0,1.0,400.0,13.0,21.0,205.0,0.0,65.0,10.0,1016.0,41.0,13.0,7.0,11.0,40.0,11.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16891,2024.0,12.0,4.0,1900.0,7.0,10.0,210.0,0.0,72.0,10.0,1028.0,41.0,7.0,2.0,5.0,16.0,5.0,1.0,0.0
16892,2024.0,12.0,4.0,2000.0,7.0,11.0,214.0,0.0,72.0,10.0,1028.0,39.0,7.0,2.0,5.0,18.0,5.0,1.0,0.0
16893,2024.0,12.0,4.0,2100.0,7.0,11.0,216.0,0.0,74.0,10.0,1028.0,27.0,7.0,2.0,4.0,18.0,4.0,1.0,0.0
16894,2024.0,12.0,4.0,2200.0,7.0,12.0,223.0,0.0,74.0,10.0,1028.0,63.0,7.0,2.0,4.0,19.0,4.0,1.0,0.0


### Collect School Holidays + Public Holidays

In [31]:
def get_school_holidays(year):

    # API Url
    api_url = "https://data.education.gouv.fr/api/explore/v2.1/catalog/datasets/fr-en-calendrier-scolaire/records"

    # Request parameters
    params = {
        "limit": -1,
        #"order_by": "t_1h",
        "timezone": "CET",
        "where": f'location="Paris" AND (annee_scolaire = "{str(year-1)}-{str(year)}" OR annee_scolaire = "{str(year)}-{str(year+1)}")'
    }

    # Get response
    response = requests.get(api_url, params=params)

    # Fetch data
    if response.status_code == 200:
        records = response.json().get("results", [])
        if records:
            start = []
            end = []
            for index, record in enumerate(records):
                start_date = record["start_date"]
                syear = int(start_date[:4])
                smonth = int(start_date[5:7])
                sday = int(start_date[8:10])
                end_date = record["end_date"]
                eyear = int(end_date[:4])
                emonth = int(end_date[5:7])
                eday = int(end_date[8:10])
                
                start.append(datetime(syear, smonth, sday))
                end.append(datetime(eyear, emonth, eday))
        else:
            print("No data found.")
    else:
        print(f"API Error : {response.status_code}")

    return start, end

In [32]:
def get_public_holidays(year):
    dates = JoursFeries.for_year(year)
    return list(dates.values())

In [33]:
print(get_school_holidays(2024))
print(get_public_holidays(2024))

([datetime.datetime(2024, 12, 21, 0, 0), datetime.datetime(2025, 4, 12, 0, 0), datetime.datetime(2024, 5, 9, 0, 0), datetime.datetime(2025, 5, 29, 0, 0), datetime.datetime(2023, 12, 23, 0, 0), datetime.datetime(2024, 2, 10, 0, 0), datetime.datetime(2025, 7, 5, 0, 0), datetime.datetime(2024, 4, 6, 0, 0), datetime.datetime(2023, 10, 21, 0, 0), datetime.datetime(2024, 7, 6, 0, 0), datetime.datetime(2024, 7, 6, 0, 0), datetime.datetime(2024, 10, 19, 0, 0), datetime.datetime(2025, 2, 15, 0, 0), datetime.datetime(2025, 7, 5, 0, 0)], [datetime.datetime(2025, 1, 6, 0, 0), datetime.datetime(2025, 4, 28, 0, 0), datetime.datetime(2024, 5, 13, 0, 0), datetime.datetime(2025, 6, 2, 0, 0), datetime.datetime(2024, 1, 8, 0, 0), datetime.datetime(2024, 2, 26, 0, 0), datetime.datetime(2025, 8, 29, 0, 0), datetime.datetime(2024, 4, 22, 0, 0), datetime.datetime(2023, 11, 6, 0, 0), datetime.datetime(2024, 9, 2, 0, 0), datetime.datetime(2024, 8, 30, 0, 0), datetime.datetime(2024, 11, 4, 0, 0), datetime.datet

In [34]:
def get_holidays_df(target_year):
    dates = pd.date_range(start=f"{target_year}-01-01", end=f"{target_year}-12-31", freq="D")

    df = pd.DataFrame({
        "year": dates.year,
        "month": dates.month,
        "day": dates.day,
        "school_holiday": [0] * len(dates),  # Première colonne vide
        "public_holiday": [0] * len(dates)   # Deuxième colonne vide
    })

    start_dates, end_dates = get_school_holidays(target_year)

    for index in range(len(start_dates)):
        sday = start_dates[index]
        eday = end_dates[index]

        sh_days = pd.date_range(start=sday, end=eday, freq="D")
        sh_days_list = sh_days.tolist()

        for sh_day in sh_days_list:
            year, month, day = sh_day.year, sh_day.month, sh_day.day
            if year != target_year:
                continue
            else:
                mask = (df["year"] == year) & (df["month"] == month) & (df["day"] == day)
                df.loc[mask, "school_holiday"] = 1
        
        ph_days = get_public_holidays(target_year)
        for ph_day in ph_days:
            year, month, day = ph_day.year, ph_day.month, ph_day.day
            mask = (df["year"] == year) & (df["month"] == month) & (df["day"] == day)
            df.loc[mask, "public_holiday"] = 1

    return df.astype(float)

In [35]:
holidays_df = pd.concat([get_holidays_df(2023), get_holidays_df(2024)], ignore_index=True)
holidays_df

Unnamed: 0,year,month,day,school_holiday,public_holiday
0,2023.0,1.0,1.0,1.0,1.0
1,2023.0,1.0,2.0,1.0,0.0
2,2023.0,1.0,3.0,1.0,0.0
3,2023.0,1.0,4.0,0.0,0.0
4,2023.0,1.0,5.0,0.0,0.0
...,...,...,...,...,...
726,2024.0,12.0,27.0,1.0,0.0
727,2024.0,12.0,28.0,1.0,0.0
728,2024.0,12.0,29.0,1.0,0.0
729,2024.0,12.0,30.0,1.0,0.0


### All data in one dataframe

In [36]:
complete_df = hourly_weather_df.merge(
    holidays_df,
    on=['year', 'month', 'day'],
    how='left'
)
complete_df

Unnamed: 0,year,month,day,hour,tempC,windspeedKmph,winddirDegree,precipMM,humidity,visibility,...,cloudcover,HeatIndexC,DewPointC,WindChillC,WindGustKmph,FeelsLikeC,uvIndex,totalSnow_cm,school_holiday,public_holiday
0,2023.0,1.0,1.0,0.0,14.0,27.0,205.0,0.0,61.0,10.0,...,23.0,14.0,6.0,11.0,50.0,11.0,1.0,0.0,1.0,1.0
1,2023.0,1.0,1.0,100.0,14.0,25.0,204.0,0.0,62.0,10.0,...,31.0,14.0,6.0,11.0,47.0,11.0,1.0,0.0,1.0,1.0
2,2023.0,1.0,1.0,200.0,13.0,23.0,204.0,0.0,63.0,10.0,...,40.0,13.0,7.0,12.0,44.0,12.0,1.0,0.0,1.0,1.0
3,2023.0,1.0,1.0,300.0,13.0,22.0,204.0,0.0,64.0,10.0,...,48.0,13.0,7.0,11.0,42.0,11.0,1.0,0.0,1.0,1.0
4,2023.0,1.0,1.0,400.0,13.0,21.0,205.0,0.0,65.0,10.0,...,41.0,13.0,7.0,11.0,40.0,11.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16891,2024.0,12.0,4.0,1900.0,7.0,10.0,210.0,0.0,72.0,10.0,...,41.0,7.0,2.0,5.0,16.0,5.0,1.0,0.0,0.0,0.0
16892,2024.0,12.0,4.0,2000.0,7.0,11.0,214.0,0.0,72.0,10.0,...,39.0,7.0,2.0,5.0,18.0,5.0,1.0,0.0,0.0,0.0
16893,2024.0,12.0,4.0,2100.0,7.0,11.0,216.0,0.0,74.0,10.0,...,27.0,7.0,2.0,4.0,18.0,4.0,1.0,0.0,0.0,0.0
16894,2024.0,12.0,4.0,2200.0,7.0,12.0,223.0,0.0,74.0,10.0,...,63.0,7.0,2.0,4.0,19.0,4.0,1.0,0.0,0.0,0.0


In [37]:
complete_df.shape

(16896, 21)

In [75]:
data = pd.concat([df_complete_trafic, complete_df])
data.shape

(69528, 25)

In [77]:
data.columns

Index(['year', 'month', 'day', 'hour', 'libelle', 'flow rate',
       'occupancy rate', 'trafic state', 'tempC', 'windspeedKmph',
       'winddirDegree', 'precipMM', 'humidity', 'visibility', 'pressure',
       'cloudcover', 'HeatIndexC', 'DewPointC', 'WindChillC', 'WindGustKmph',
       'FeelsLikeC', 'uvIndex', 'totalSnow_cm', 'school_holiday',
       'public_holiday'],
      dtype='object')

In [78]:
data.isna().sum()

year                  0
month                 0
day                   0
hour                  0
libelle           44436
flow rate         45004
occupancy rate    45224
trafic state      44436
tempC             52632
windspeedKmph     52632
winddirDegree     52632
precipMM          52632
humidity          52632
visibility        52632
pressure          52632
cloudcover        52632
HeatIndexC        52632
DewPointC         52632
WindChillC        52632
WindGustKmph      52632
FeelsLikeC        52632
uvIndex           52632
totalSnow_cm      52632
school_holiday    52632
public_holiday    52632
dtype: int64

In [79]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 69528 entries, 0 to 16895
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   year            69528 non-null  object 
 1   month           69528 non-null  object 
 2   day             69528 non-null  object 
 3   hour            69528 non-null  object 
 4   libelle         25092 non-null  object 
 5   flow rate       24524 non-null  float64
 6   occupancy rate  24304 non-null  float64
 7   trafic state    25092 non-null  object 
 8   tempC           16896 non-null  float64
 9   windspeedKmph   16896 non-null  float64
 10  winddirDegree   16896 non-null  float64
 11  precipMM        16896 non-null  float64
 12  humidity        16896 non-null  float64
 13  visibility      16896 non-null  float64
 14  pressure        16896 non-null  float64
 15  cloudcover      16896 non-null  float64
 16  HeatIndexC      16896 non-null  float64
 17  DewPointC       16896 non-null  floa

In [47]:
data_numeric = data.drop(columns=['trafic state'])
data_numeric.shape

(69528, 23)

### Impute missing values 

In [None]:
# from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier 

In [None]:
# # Fill missing values
# def fill_missing_with_rf(data, target_column):

#     df_train = data[data[target_column].notna()]
#     df_missing = data[data[target_column].isna()]
    
#     X_train = df_train.drop(columns=[target_column])
#     y_train = df_train[target_column]
#     X_missing = df_missing.drop(columns=[target_column])
    
#     if pd.api.types.is_numeric_dtype(data[target_column]):
#         model = RandomForestRegressor()
#     else:
        
#         model = RandomForestClassifier()
    
#     model.fit(X_train, y_train)
    
#     predictions = model.predict(X_missing)

#     data.loc[data[target_column].isna(), target_column] = predictions
#     return data

In [84]:
# data_filled = fill_missing_with_rf(data_numeric,'flow rate')
# data_filled.shape