In [14]:
import numpy as np
import pandas as pd
import geopandas as gpd
import plotly.express as px
import folium

# Exploration de la liste des gares

In [15]:
# Load the raw station list from the GeoJSON export (path is relative to the notebook).
gares = gpd.read_file("../data/raw/liste-des-gares.geojson")
# Quick sanity check on the size of the table: (rows, columns).
print(gares.shape)
# Last expression of the cell: display the first rows.
gares.head()

(3884, 18)


Unnamed: 0,code_uic,libelle,fret,voyageurs,code_ligne,rg_troncon,pk,commune,departemen,idreseau,idgaia,x_l93,y_l93,x_wgs84,y_wgs84,c_geo,geo_point_2d,geometry
0,87009696,La Douzillère,N,O,594000,1,244+100,JOUE-LES-TOURS,INDRE-ET-LOIRE,4650,d9dc0092-6667-11e3-89ff-01f464e0362d,522803.9864,6695782.0,0.653001,47.338661,"{ ""lon"": 0.65300138668248875, ""lat"": 47.338661...","{ ""lon"": 0.65300138668248675, ""lat"": 47.338661...",POINT (0.653 47.33866)
1,87142554,Châtillon-sur-Seine,O,N,839000,1,035+431,SAINTE-COLOMBE-SUR-SEINE,COTE-D'OR,9201,29d3de32-dfbc-11e3-a2ff-01a464e0362d,815990.5833,6753363.0,4.551565,47.870404,"{ ""lon"": 4.5515651191521753, ""lat"": 47.8704042...","{ ""lon"": 4.5515651191521762, ""lat"": 47.8704042...",POINT (4.55157 47.8704)
2,87382218,La Défense,N,O,973000,1,008+295,PUTEAUX,HAUTS-DE-SEINE,4648,c0d4c69a-f312-11e3-90ff-015864e0362d,644164.1873,6866206.0,2.238472,48.893437,"{ ""lon"": 2.2384716845345993, ""lat"": 48.8934372...","{ ""lon"": 2.2384716845345984, ""lat"": 48.8934372...",POINT (2.23847 48.89344)
3,87718122,Byans,N,O,871000,1,015+118,BYANS-SUR-DOUBS,DOUBS,3446,297c8c1e-dfbc-11e3-a2ff-01a464e0362d,916198.4419,6672579.0,5.852088,47.118329,"{ ""lon"": 5.8520880691962924, ""lat"": 47.1183292...","{ ""lon"": 5.8520880691962951, ""lat"": 47.1183292...",POINT (5.85209 47.11833)
4,87721829,Chamelet,N,O,775000,1,074+576,CHAMELET,RHONE,3578,d9cff2d6-6667-11e3-89ff-01f464e0362d,816664.1967,6543552.0,4.507016,45.98167,"{ ""lon"": 4.5070162445435979, ""lat"": 45.9816702...","{ ""lon"": 4.5070162445435997, ""lat"": 45.9816702...",POINT (4.50702 45.98167)


# Filtrage de la donnée

Dans ce projet, on ne s'intéresse qu'aux gares de voyageurs : on écarte donc toutes les gares dont la colonne ```voyageurs``` ne vaut pas "O".

In [11]:
def filter_voyageurs(gares_df):
    """Return only the passenger stations of ``gares_df``.

    Rows whose ``voyageurs`` column equals ``"O"`` are kept. The ``voyageurs``
    column, constant after filtering, is dropped and the index is renumbered
    from 0. The input frame is not modified; a new frame is returned.
    """
    return (
        gares_df
        .query("voyageurs == 'O'")
        .drop(columns=["voyageurs"])
        .reset_index(drop=True)
    )

# Apply the passenger-station filter to the raw table.
filtered_gares = filter_voyageurs(gares)
# Shape after filtering: fewer rows, and one column less (`voyageurs` is dropped).
print(filtered_gares.shape)
# Last expression of the cell: display the first rows.
filtered_gares.head()

(3352, 17)


Unnamed: 0,code_uic,libelle,fret,code_ligne,rg_troncon,pk,commune,departemen,idreseau,idgaia,x_l93,y_l93,x_wgs84,y_wgs84,c_geo,geo_point_2d,geometry
0,87009696,La Douzillère,N,594000,1,244+100,JOUE-LES-TOURS,INDRE-ET-LOIRE,4650,d9dc0092-6667-11e3-89ff-01f464e0362d,522803.9864,6695782.0,0.653001,47.338661,"{ ""lon"": 0.65300138668248875, ""lat"": 47.338661...","{ ""lon"": 0.65300138668248675, ""lat"": 47.338661...",POINT (0.653 47.33866)
1,87382218,La Défense,N,973000,1,008+295,PUTEAUX,HAUTS-DE-SEINE,4648,c0d4c69a-f312-11e3-90ff-015864e0362d,644164.1873,6866206.0,2.238472,48.893437,"{ ""lon"": 2.2384716845345993, ""lat"": 48.8934372...","{ ""lon"": 2.2384716845345984, ""lat"": 48.8934372...",POINT (2.23847 48.89344)
2,87718122,Byans,N,871000,1,015+118,BYANS-SUR-DOUBS,DOUBS,3446,297c8c1e-dfbc-11e3-a2ff-01a464e0362d,916198.4419,6672579.0,5.852088,47.118329,"{ ""lon"": 5.8520880691962924, ""lat"": 47.1183292...","{ ""lon"": 5.8520880691962951, ""lat"": 47.1183292...",POINT (5.85209 47.11833)
3,87721829,Chamelet,N,775000,1,074+576,CHAMELET,RHONE,3578,d9cff2d6-6667-11e3-89ff-01f464e0362d,816664.1967,6543552.0,4.507016,45.98167,"{ ""lon"": 4.5070162445435979, ""lat"": 45.9816702...","{ ""lon"": 4.5070162445435997, ""lat"": 45.9816702...",POINT (4.50702 45.98167)
4,87471060,L'Hermitage-Mordelles,O,420000,1,385+104,L'HERMITAGE,ILLE-ET-VILAINE,8893,e1934ea8-19bf-11e5-a6ff-01fc64e0362d,341644.2469,6791268.0,-1.819212,48.123336,"{ ""lon"": -1.8192115152677604, ""lat"": 48.123335...","{ ""lon"": -1.8192115152677635, ""lat"": 48.123335...",POINT (-1.81921 48.12334)


## Réduction de la donnée

### ```fret``` et ```voyageurs```

On a déjà filtré les données pour ne garder que les gares de voyageurs : la colonne ```voyageurs``` est donc constante et peut être supprimée.

Pour ```fret```, il faut examiner si ```voyageurs``` ne contient pas exactement la même information, autrement dit, si une gare de fret ne peut pas être une gare de voyageurs, et réciproquement.

In [12]:
# Contingency table of `fret` (rows) against `voyageurs` (columns) on the
# unfiltered table, to check whether the two flags carry the same information.
print(pd.crosstab(
    gares["fret"],
    gares["voyageurs"],
))

voyageurs    N     O
fret                
N            0  2280
O          532  1072


Or, le tableau montre que certaines gares sont à la fois des gares de fret et des gares de voyageurs. On garde donc cette information, qui peut se révéler pertinente. On devra néanmoins remplacer les "O" et "N" par des valeurs booléennes.

### ```rg_troncon```, ```x_l93```, ```y_l93```, ```x_wgs84```, ```y_wgs84```, ```geo_point_2d```

Pour les mêmes raisons que dans ```speeds.ipynb```, on peut supprimer ces colonnes.

### ```idgaia``` et ```idreseau```

Ce sont des colonnes qui n'apparaissent nulle part ailleurs dans les données, donc on les supprime. De plus, on a déjà ```code_uic``` qui peut servir de clé primaire.

### ```code_uic```

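Ce sera la clé primaire que nous utiliserons. Il y a cependant des doublons, mais il semble qu'ils désignent une seule et même gare : ils ne diffèrent que par des colonnes comme ```code_ligne``` et ```pk``` (une même gare peut se trouver sur plusieurs lignes). On va donc les supprimer en ne gardant que la première occurrence de chaque ```code_uic```.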

In [18]:
# Number of rows whose `code_uic` repeats the value of an earlier row.
print(gares["code_uic"].duplicated().sum())

# Inspect one duplicated code to see how the repeated rows differ.
gares[gares["code_uic"] == "87215335"]

414


Unnamed: 0,code_uic,libelle,fret,voyageurs,code_ligne,rg_troncon,pk,commune,departemen,idreseau,idgaia,x_l93,y_l93,x_wgs84,y_wgs84,c_geo,geo_point_2d,geometry
2444,87215335,Berthelming,N,O,140000,1,078+067,BERTHELMING,MOSELLE,3195,d9a58cd6-6667-11e3-89ff-01f464e0362d,994288.302,6864055.0,7.008898,48.809317,"{ ""lon"": 7.0088979027333949, ""lat"": 48.8093172...","{ ""lon"": 7.0088979027333993, ""lat"": 48.8093172...",POINT (7.0089 48.80932)
3863,87215335,Berthelming,N,O,168000,1,000+000,BERTHELMING,MOSELLE,3195,d9a58cd6-6667-11e3-89ff-01f464e0362d,994256.629,6864196.0,7.008565,48.810599,"{ ""lon"": 7.008564707523945, ""lat"": 48.81059944...","{ ""lon"": 7.0085647075239477, ""lat"": 48.8105994...",POINT (7.00856 48.8106)


In [17]:
def process_gares(gares_df):
    """Build the cleaned passenger-station table from the raw station list.

    Steps:
      1. keep only the rows whose ``voyageurs`` column equals ``"O"``;
      2. convert ``fret`` from ``"O"``/``"N"`` strings to booleans
         (``True`` when the value is ``"O"``);
      3. keep only the columns ``code_uic``, ``libelle``, ``fret``,
         ``code_ligne`` and ``geometry``;
      4. keep the first row of each ``code_uic`` and renumber the index
         from 0 so the result has no gaps.

    Note: when a ``code_uic`` appears on several rows, only the values of
    the first row (including its ``code_ligne``) are kept.

    Parameters
    ----------
    gares_df : DataFrame
        Raw station table. Must contain the columns ``voyageurs``,
        ``code_uic``, ``libelle``, ``fret``, ``code_ligne`` and ``geometry``.
        It is not modified.

    Returns
    -------
    DataFrame
        A new frame with one row per ``code_uic`` and a 0..n-1 index.
    """
    # Only the columns that are of interest downstream.
    relevant_columns = ["code_uic", "libelle", "fret", "code_ligne", "geometry"]

    gares_processed_df = (
        gares_df
        .query("voyageurs == 'O'")
        # Vectorised comparison: always yields a boolean column, even when empty.
        .assign(fret=lambda df: df["fret"] == "O")
        [relevant_columns]
        .drop_duplicates(subset=["code_uic"])
        # Reset the index last, otherwise the rows removed by the
        # de-duplication leave holes in it.
        .reset_index(drop=True)
    )

    return gares_processed_df

# Run the full cleaning pipeline on the raw table.
processed_gares = process_gares(gares)
# Shape of the cleaned table: (rows, columns).
print(processed_gares.shape)
# Last expression of the cell: display the first rows.
processed_gares.head()

(2974, 5)


Unnamed: 0,code_uic,libelle,fret,code_ligne,geometry
0,87009696,La Douzillère,False,594000,POINT (0.653 47.33866)
1,87382218,La Défense,False,973000,POINT (2.23847 48.89344)
2,87718122,Byans,False,871000,POINT (5.85209 47.11833)
3,87721829,Chamelet,False,775000,POINT (4.50702 45.98167)
4,87471060,L'Hermitage-Mordelles,True,420000,POINT (-1.81921 48.12334)
