In [2]:
import requests
import pandas as pd
import numpy as np 


In [3]:
def get_names_geo_data_from_sncf_api(timeout=30.0):
    """Download name, INSEE code and coordinates of the SNCF passenger stations.

    The ``gares-de-voyageurs`` dataset is read page by page (100 records per
    request, the maximum this API accepts) until an empty page is returned.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per downloaded record. The nested ``position_geographique``
        field is flattened into ``lon`` and ``lat`` columns; both columns are
        always present and hold NaN where the API gave no position.

    Raises
    ------
    requests.HTTPError
        If any page request comes back with an error status. Failing loudly
        avoids both an endless retry loop and a silently truncated table.
    """
    url = (
        "https://ressources.data.sncf.com"
        "/api/explore/v2.1/catalog/datasets/gares-de-voyageurs/records"
    )
    page_size = 100  # maximum "limit" accepted by this API
    params = {
        "select": "nom,position_geographique,codeinsee",
        "limit": page_size,
        "offset": 0,  # advanced by page_size after every non-empty page
    }

    records = []
    while True:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        page = response.json()["results"]
        if not page:
            # An empty page means the offset went past the last record.
            break
        records.extend(page)
        params["offset"] += page_size

    # verify nb of observations
    print(f"nb of stations downloaded: {len(records)}")

    # json_normalize only keeps a plain "position_geographique" column when at
    # least one record has a null position, so the drop must tolerate its absence.
    df = (
        pd.json_normalize(records)
        .drop(columns="position_geographique", errors="ignore")
        .rename(
            columns={
                "position_geographique.lon": "lon",
                "position_geographique.lat": "lat",
            }
        )
    )
    # Guarantee the coordinate columns even if no record carried a position.
    for column in ("lon", "lat"):
        if column not in df.columns:
            df[column] = float("nan")
    return df
def get_absent_lat_lon_from_gouv_api(df, timeout=30.0):
    """Fill missing station coordinates by geocoding the station name.

    Every row whose ``lon`` is NaN is looked up on the api-adresse.data.gouv.fr
    search endpoint using its ``nom`` value, and the first hit's coordinates
    are written to ``lon`` and ``lat``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``nom``, ``lon`` and ``lat`` columns. It is modified in place.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Rows the geocoder could not resolve (error
        status or no match) keep missing coordinates.
    """
    base_url = "https://api-adresse.data.gouv.fr/search/"
    # Boolean-mask selection returns a new frame, so writing to df while
    # iterating over these rows is safe.
    rows_without_lon = df[df["lon"].isna()]

    filled_count = 0
    for idx, station_name in rows_without_lon["nom"].items():
        # Request parameters: the station name as query, best match only.
        response = requests.get(
            base_url, params={"q": station_name, "limit": 1}, timeout=timeout
        )
        lon, lat = None, None
        if response.status_code == 200:
            # A 200 answer can still carry an empty feature list (no match);
            # indexing [0] blindly would raise IndexError.
            features = response.json().get("features") or []
            if features:
                coordinates = features[0]["geometry"]["coordinates"]
                lon, lat = coordinates[0], coordinates[1]
                filled_count += 1
        df.loc[idx, "lon"] = lon
        df.loc[idx, "lat"] = lat
    print(filled_count, "absent addresses filled successfully")
    return df

In [4]:
# Download the station table, then count the missing values per column.
df = get_names_geo_data_from_sncf_api()
df.isnull().sum()

nb of stations downloaded: 2881


nom          0
codeinsee    0
lon          7
lat          7
dtype: int64

In [5]:
# Geocode the rows that still lack coordinates, then re-count missing values.
df = get_absent_lat_lon_from_gouv_api(df)
df.isnull().sum()

7 absent addresses filled successfully


nom          0
codeinsee    0
lon          0
lat          0
dtype: int64

In [6]:
# Show the station table after the missing coordinates have been filled.
df

Unnamed: 0,nom,codeinsee,lon,lat
0,Châteauneuf - Bujaleuf,87105,1.645303,45.759316
1,Cinq-Mars-la-Pile,37077,0.460488,47.345226
2,Bécon les Bruyères,92026,2.268380,48.905920
3,Cousance,39173,5.386822,46.535357
4,Culmont - Chalindrey,52155,5.443222,47.810090
...,...,...,...,...
2876,Messein,54366,6.137836,48.612879
2877,Port-la-Nouvelle,11266,3.038819,43.019934
2878,Pierre-Bénite,69152,4.824506,45.706677
2879,Caffiers,62191,1.812034,50.850068
