In [8]:
import requests
import pandas as pd
import numpy as np 


In [70]:
def get_names_geo_data_from_sncf_api(endpoint_suffix, variables_in_string, **kwargs):
    """Download every record of one SNCF open-data dataset into a DataFrame.

    Records are requested page by page from
    ``/api/explore/v2.1/catalog/datasets/{endpoint_suffix}/records`` until the
    API answers with an empty page.

    Parameters
    ----------
    endpoint_suffix : str
        Dataset identifier inserted into the endpoint URL.
    variables_in_string : str
        Value sent as the ``select`` query parameter.
    **kwargs
        Extra query parameters (e.g. ``refine``, ``group_by``). They are merged
        over the defaults ``limit=100`` and ``offset=0`` and may override them.

    Returns
    -------
    pandas.DataFrame
        ``pd.json_normalize`` applied to all downloaded records.

    Raises
    ------
    requests.HTTPError
        If any page request comes back with an HTTP error status. Failing fast
        avoids paging forever on a response that never becomes empty.
    """
    base_url = "https://ressources.data.sncf.com"
    # Endpoint for the desired dataset
    endpoint = f"/api/explore/v2.1/catalog/datasets/{endpoint_suffix}/records"
    url = f"{base_url}{endpoint}"
    # Parameters for the API request
    params = {
        "select": variables_in_string,
        "limit": 100,  # in this API maximum limit is 100
        "offset": 0,  # moving window start: 0, then 100, then 200, ...
    }
    params.update(kwargs)

    records = []
    while True:
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        page = response.json()["results"]
        if not page:
            break
        records.extend(page)
        # Advance by what was actually received so a caller-supplied `limit`
        # (or a short page) never skips or duplicates records.
        params["offset"] = int(params["offset"]) + len(page)

    # verify nb of observations
    print(f"nb of stations downloaded: {len(records)}")
    return pd.json_normalize(records)
def get_absent_lat_lon_from_gouv_api(df):
    """Fill missing station coordinates using the api-adresse.data.gouv.fr search.

    For every row whose ``lon`` is missing, the query ``"gare de <nom>"`` is
    sent to the geocoder and the first returned feature's coordinates are
    written to ``lon`` and ``lat``. Lookup is best-effort: a row stays empty
    when the request fails, the answer is not HTTP 200, or no feature is found.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``nom``, ``lon`` and ``lat``. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the resolved coordinates filled in.
    """
    base_url = "https://api-adresse.data.gouv.fr/search/"
    missing_names = df.loc[df["lon"].isna(), "nom"]
    filled = 0
    for idx, station_name in missing_names.items():
        coordinates = [None, None]
        # A missing/blank name cannot be geocoded; leave the row empty.
        if isinstance(station_name, str) and station_name.strip():
            # Request parameters
            params = {"q": "gare de " + station_name, "limit": 1}
            try:
                response = requests.get(base_url, params=params, timeout=30)
                if response.status_code == 200:
                    # A 200 answer may still carry an empty feature list.
                    features = response.json().get("features") or []
                    if features:
                        coordinates = features[0]["geometry"]["coordinates"]
                        filled += 1
            except (requests.RequestException, ValueError) as exc:
                # One failed lookup must not abort the remaining stations.
                print(f"geocoding failed for {station_name!r}: {exc}")
        df.loc[idx, "lon"] = coordinates[0]
        df.loc[idx, "lat"] = coordinates[1]
    print(filled, "absent addresses filled successfully")
    return df

In [None]:
# Download the passenger-station list (name, position, INSEE code, UIC code).
df = get_names_geo_data_from_sncf_api(
    endpoint_suffix="gares-de-voyageurs",
    variables_in_string="nom,position_geographique,codeinsee,codes_uic",
)

# Flatten the normalized position columns into plain "lon" / "lat".
# NOTE(review): the bare "position_geographique" column presumably only exists
# when at least one station has no position; errors="ignore" keeps the cell
# runnable when that column is absent.
df = df.drop(columns="position_geographique", errors="ignore").rename(
    columns={
        "position_geographique.lon": "lon",
        "position_geographique.lat": "lat",
    }
)

# Missing values per column (stations without coordinates show up here).
df.isna().sum()


nb of stations downloaded: 2881


nom          0
codeinsee    0
codes_uic    0
lon          7
lat          7
dtype: int64

In [54]:
# Download the "frequentation-gares" dataset; the select clause aliases the
# station name and full UIC code to "nom" / "codes_uic".
frequentation = get_names_geo_data_from_sncf_api(
    "frequentation-gares",
    "total_voyageurs_2023,code_postal,nom_gare as nom,code_uic_complet as codes_uic ",
)

nb of stations downloaded: 3010


In [109]:
# Download the "proprete-en-gare" dataset: average of taux_de_conformite,
# grouped by station name and UIC code, refined on mois:"2023".
# NOTE(review): the variable name `property` shadows the Python builtin.
property = get_names_geo_data_from_sncf_api(
    "proprete-en-gare",
    "nom_gare as nom ,uic as codes_uic ,avg(taux_de_conformite) as taux_de_conformite ",
    refine='mois:"2023"',
    group_by="nom_gare, uic ",
)

nb of stations downloaded: 1465


In [110]:
# Build "codes_uic" from "uic" by dropping its first two characters.
property["codes_uic"] = property["uic"].str.slice(start=2)

In [112]:
# Remove the raw "nom_gare" and "uic" columns.
property = property.drop(columns=["nom_gare", "uic"])

In [113]:
# Display the `property` DataFrame.
# NOTE(review): this name shadows the Python builtin `property`.
property

Unnamed: 0,nom,codes_uic,taux_de_conformite
0,Abancourt,87313759,96.906971
1,Abbaretz,87481614,90.326087
2,Abbeville,87317362,96.531024
3,Agay,87757559,90.895105
4,Agde,87781278,93.897317
...,...,...,...
1460,Étival-Clairefontaine,87144642,90.224309
1461,Étriché - Châteauneuf,87484154,97.222222
1462,Évian-les-Bains,87745679,81.545884
1463,Évreux Normandie,87387001,94.598827


In [95]:
# Left-join the station attributes onto the passenger counts, matching on both
# the UIC code and the station name.
merged = pd.merge(frequentation, df, how="left", on=["codes_uic", "nom"])
# Keep only stations with a strictly positive 2023 passenger count.
merged = merged.loc[merged["total_voyageurs_2023"] > 0]
# Missing values per column after the join.
merged.isna().sum()

total_voyageurs_2023     0
code_postal              0
nom                      0
codes_uic                0
codeinsee               92
lon                     92
lat                     92
dtype: int64

In [115]:
# Preview of the left join with the cleanliness table.
# NOTE(review): the result is only displayed, not assigned back to `merged`.
pd.merge(merged, property, how="left", on=["codes_uic", "nom"])

Unnamed: 0,total_voyageurs_2023,code_postal,nom,codes_uic,codeinsee,lon,lat,taux_de_conformite
0,54862,44170,Abbaretz,87481614,44001,-1.524416,47.554643,90.326087
1,188426,78100,Achères Grand Cormier,87386052,78551,2.091903,48.955183,
2,82597,62121,Achiet-le-Grand,87342048,62005,2.780168,50.131752,
3,39364,83530,Agay,87757559,83118,6.856500,43.431370,90.895105
4,17034,30220,Aigues-Mortes,87775858,30003,4.191210,43.570901,97.643098
...,...,...,...,...,...,...,...,...
2931,189583,67720,Weyersheim,87213678,67529,7.797103,48.717790,96.428571
2932,41410,68760,Willer-sur-Thur,87182584,68372,7.069320,47.842549,
2933,35330,62126,Wimille - Wimereux,87317123,62894,1.613746,50.763846,
2934,43728,40100,Ygos-Saint-Saturnin,87671487,40333,-0.736153,43.978185,90.454545


In [56]:

# Geocode the stations that still lack coordinates, then re-check null counts.
merged = merged.pipe(get_absent_lat_lon_from_gouv_api)
merged.isna().sum()

92 absent addresses filled successfully


total_voyageurs_2023     0
code_postal              0
nom                      0
codes_uic                0
codeinsee               92
lon                      0
lat                      0
dtype: int64