In [None]:
import requests
import pandas as pd
import numpy as np 
from tqdm import tqdm


In [81]:
def get_names_geo_data_from_sncf_api(endpoint_suffix, **kwargs):
    """Download every record of an SNCF open-data dataset into a DataFrame.

    The records endpoint is paginated: pages are requested one after the
    other, moving ``offset`` forward by the page size, until the API answers
    with an empty ``results`` list.

    Parameters
    ----------
    endpoint_suffix : str
        Dataset identifier inserted into the
        ``/api/explore/v2.1/catalog/datasets/<id>/records`` URL.
    **kwargs
        Extra query-string parameters forwarded as-is to the API (this
        notebook uses ``select``, ``refine`` and ``group_by``). ``limit`` and
        ``offset`` may also be overridden this way.

    Returns
    -------
    pandas.DataFrame
        The accumulated records flattened with ``pd.json_normalize`` (nested
        objects become dotted column names).

    Raises
    ------
    requests.HTTPError
        If any page request returns an error status. Failing loudly avoids
        both an endless retry loop and a silently truncated dataset.
    """
    base_url = "https://ressources.data.sncf.com"
    # Endpoint for the desired dataset
    endpoint = f"/api/explore/v2.1/catalog/datasets/{endpoint_suffix}/records"
    url = f"{base_url}{endpoint}"

    # Default paging window; per the original author the API caps `limit` at 100.
    params = {"limit": 100, "offset": 0}
    params.update(kwargs)
    # Step by the limit actually sent, so a caller-supplied `limit` smaller
    # than 100 does not make the pagination skip records.
    page_size = int(params["limit"])

    records = []
    while True:
        # A timeout prevents one stalled connection from blocking the notebook.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        page = response.json()["results"]
        if not page:
            break
        records.extend(page)
        params["offset"] = params["offset"] + page_size

    # verify nb of observations
    print(f"nb of stations downloaded: {len(records)}")
    return pd.json_normalize(records)
def get_absent_lat_lon_from_gouv_api(df):
    """Fill missing station coordinates using the api-adresse.data.gouv.fr search.

    For every row whose ``lon`` is missing, the service is queried with
    ``"gare de <nom_gare>"`` and the coordinates of the first returned
    feature are written into ``lon`` / ``lat``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``nom_gare``, ``lon`` and ``lat``.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    The lookup is best-effort: a network error, a non-200 answer or an empty
    result list leaves the row untouched instead of aborting the whole run.
    The number of rows actually filled is printed at the end.
    """
    base_url = "https://api-adresse.data.gouv.fr/search/"
    # Request parameters: only the best match is needed.
    params = {"q": "", "limit": 1}
    filled = 0

    # Snapshot of the station names to look up, taken before df is mutated.
    missing_names = df.loc[df["lon"].isna(), "nom_gare"]
    for idx, station_name in missing_names.items():
        params["q"] = "gare de " + station_name
        try:
            response = requests.get(base_url, params=params, timeout=30)
        except requests.RequestException:
            # Connection problem or timeout: skip this station.
            continue
        if response.status_code != 200:
            continue
        features = response.json().get("features") or []
        if not features:
            # The geocoder found nothing; indexing [0] would raise IndexError.
            continue
        # The original code reads index 0 as longitude and index 1 as latitude.
        lon, lat = features[0]["geometry"]["coordinates"][:2]
        df.loc[idx, "lon"] = lon
        df.loc[idx, "lat"] = lat
        filled += 1

    print(filled, "absent addresses filled successfully")
    return df

In [82]:
# Download the "gares-de-voyageurs" dataset: station name, position,
# INSEE code and UIC code (aliased to the column names used below).
df = get_names_geo_data_from_sncf_api(
    endpoint_suffix="gares-de-voyageurs",
    select="""nom as nom_gare,
                           position_geographique,
                           codeinsee,
                           codes_uic as uic""",
)

# json_normalize expands the nested position into dotted ".lon" / ".lat"
# columns. A raw "position_geographique" column is only left over when some
# record has no position object, so dropping it must tolerate its absence
# (errors="ignore") instead of raising KeyError.
df = df.drop(columns="position_geographique", errors="ignore").rename(
    columns={
        "position_geographique.lon": "lon",
        "position_geographique.lat": "lat",
    }
)


nb of stations downloaded: 2881


In [83]:
# Download the "frequentation-gares" dataset, keeping the 2023 passenger
# total, the postal code, the station name and the full UIC code (exposed
# as "uic" so it can serve as the join key in later cells).
frequentation = get_names_geo_data_from_sncf_api(
    endpoint_suffix="frequentation-gares",
    select="""total_voyageurs_2023,
                           code_postal,
                           nom_gare,
                           code_uic_complet as uic """,
)

nb of stations downloaded: 3010


In [84]:
# Download the "proprete-en-gare" dataset, restricted to 2023 through the
# `refine` filter and aggregated per (nom_gare, uic): the compliance rate
# is averaged within each group.
# NOTE(review): the variable name `property` shadows Python's builtin of the
# same name for the rest of the session; it is presumably meant to be
# "proprete" -- renaming it requires updating every later cell that uses it.
property = get_names_geo_data_from_sncf_api(
    endpoint_suffix="proprete-en-gare",
    select="""nom_gare,
                           uic ,
                           avg(taux_de_conformite) as taux_de_conformite """,
    refine='mois:"2023"',
    group_by="nom_gare, uic "
    
)

nb of stations downloaded: 1465


In [85]:
# Drop the first two characters of every UIC code in the cleanliness table
# (presumably a prefix that the other tables do not carry -- verify).
# NOTE: this rewrites the column from itself, so running the cell a second
# time strips two more characters.
property["uic"] = property["uic"].str.slice(start=2)

In [87]:
# Attach the station attributes from `df` to the ridership table (left join
# on "uic"; the station name is taken from `frequentation` only), then keep
# the rows whose 2023 passenger total is strictly positive.
merged = (
    frequentation
    .merge(df.drop(columns="nom_gare"), how="left", on="uic")
    .loc[lambda frame: frame["total_voyageurs_2023"] > 0]
)


In [88]:
# Left-join the averaged cleanliness compliance rate onto the station table.
# NOTE: `merged` is rebuilt from itself, so re-running this cell joins the
# same columns a second time.
merged = merged.merge(
    property.drop(columns="nom_gare"),
    how="left",
    on="uic",
)

In [90]:
# Download the "regularite-mensuelle-tgv-aqst" dataset, restricted to 2023
# through the `refine` filter and aggregated per departure station
# (gare_depart): counts are summed, durations and delays are averaged.
# NOTE(review): avg(retard_moyen_depart) averages values that already look
# like per-row means, without weighting by the number of trains -- confirm
# that this is the intended statistic.
tgv = get_names_geo_data_from_sncf_api(
    endpoint_suffix="regularite-mensuelle-tgv-aqst",
    select=""" 
                            gare_depart,
                            avg(duree_moyenne) as duree_moyenne ,
                            sum(nb_train_prevu) as nb_train_prevu ,
                            sum(nb_annulation) as nb_annulation , 
                            sum(nb_train_depart_retard) as nb_train_depart_retard, 
                            avg(retard_moyen_depart) as  retard_moyen_depart  """,
    refine='date:"2023"',
    group_by="gare_depart "
    
)

nb of stations downloaded: 57


In [91]:
# Download the "gares-equipees-du-wifi" dataset: UIC code, station name
# (aliased to nom_gare) and the service_wifi field.
wifi = get_names_geo_data_from_sncf_api(
    endpoint_suffix="gares-equipees-du-wifi",
    select=""" 
                            uic,
                            nom_de_la_gare as nom_gare ,
                            service_wifi  
                            """,   
)

nb of stations downloaded: 2997


In [None]:
# Drop the first two characters of the UIC codes in the merged table before
# the wifi join (presumably so they match the format of wifi["uic"] -- verify).
# NOTE: this rewrites the column from itself, so running the cell a second
# time strips two more characters.
merged["uic"] = merged["uic"].str.slice(start=2)


In [93]:
# Left-join the wifi availability flag onto the station table.
# NOTE: `merged` is rebuilt from itself, so re-running this cell joins the
# same column a second time.
merged = merged.merge(
    wifi.drop(columns="nom_gare"),
    how="left",
    on="uic",
)

In [94]:
# Display the merged station table built by the previous cells.
merged

Unnamed: 0,total_voyageurs_2023,code_postal,nom_gare,uic,codeinsee,lon,lat,taux_de_conformite,service_wifi
0,54862,44170,Abbaretz,481614,44001,-1.524416,47.554643,90.326087,Non
1,188426,78100,Achères Grand Cormier,386052,78551,2.091903,48.955183,,Non
2,82597,62121,Achiet-le-Grand,342048,62005,2.780168,50.131752,,Non
3,39364,83530,Agay,757559,83118,6.856500,43.431370,90.895105,Non
4,17034,30220,Aigues-Mortes,775858,30003,4.191210,43.570901,97.643098,Non
...,...,...,...,...,...,...,...,...,...
2931,189583,67720,Weyersheim,213678,67529,7.797103,48.717790,96.428571,Non
2932,41410,68760,Willer-sur-Thur,182584,68372,7.069320,47.842549,,Non
2933,35330,62126,Wimille - Wimereux,317123,62894,1.613746,50.763846,,Non
2934,43728,40100,Ygos-Saint-Saturnin,671487,40333,-0.736153,43.978185,90.454545,Non


In [98]:
# Show five randomly chosen rows of the TGV table; no random_state is set,
# so a different selection is displayed on every run.
tgv.sample(5)

Unnamed: 0,gare_depart,duree_moyenne,nb_train_prevu,nb_annulation,nb_train_depart_retard,retard_moyen_depart
53,TOURS,77.083333,2139,72,188,10.277803
16,GENEVE,194.666667,2790,109,1819,4.419866
56,ZURICH,248.666667,1905,52,623,1.705202
48,STRASBOURG,224.875,7394,177,3730,10.550119
55,VANNES,166.666667,3398,87,1857,8.770667


In [96]:

# Geocode the stations that still have no longitude (one HTTP request per
# such row), then count the remaining missing values in each column.
merged=get_absent_lat_lon_from_gouv_api(merged)
merged.isna().sum()

90 absent addresses filled successfully


total_voyageurs_2023       0
code_postal                0
nom_gare                   0
uic                        0
codeinsee                 90
lon                        0
lat                        0
taux_de_conformite      1474
service_wifi              49
dtype: int64