In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import plotly.express as px

# Exploration des fréquentations dees gares

In [2]:
frequentations = pd.read_csv("../data/frequentation-gares.csv",sep=";")
print(frequentations.shape)
frequentations.head()

(3010, 22)


Unnamed: 0,Nom de la gare,Code UIC,Code postal,Segmentation DRG,Total Voyageurs 2023,Total Voyageurs + Non voyageurs 2023,Total Voyageurs 2022,Total Voyageurs + Non voyageurs 2022,Total Voyageurs 2021,Total Voyageurs + Non voyageurs 2021,...,Total Voyageurs 2019,Total Voyageurs + Non voyageurs 2019,Total Voyageurs 2018,Total Voyageurs + Non voyageurs 2018,Total Voyageurs 2017,Total Voyageurs + Non voyageurs 2017,Total Voyageurs 2016,Total Voyageurs + Non voyageurs 2016,Total Voyageurs 2015,Total Voyageurs + Non voyageurs 2015
0,Acheux - Franleu,87316745,80560,C,0,0,2,2,0,0,...,2,2,235,235,104,104,308,308,898,898
1,Aiguebelette-le-Lac,87741421,73610,C,4394,4394,4046,4046,3658,3658,...,4754,4754,2535,2535,3117,3117,3912,3912,4071,4071
2,Aigueperse,87734129,63260,C,86794,86794,70599,70599,55143,55143,...,75954,75954,70745,70745,77218,77218,78088,78088,75873,75873
3,Albi,87615005,81000,B,568827,711034,531922,664902,391271,489089,...,415630,519537,368147,460184,432819,541024,392651,490814,412872,516090
4,Albi Madeleine,87615195,81000,C,89396,89396,80495,80495,57591,57591,...,51855,51855,44787,44787,55019,55019,52084,52084,63723,63723


In [3]:
print(frequentations.head())

        Nom de la gare  Code UIC  Code postal Segmentation DRG  \
0     Acheux - Franleu  87316745        80560                C   
1  Aiguebelette-le-Lac  87741421        73610                C   
2           Aigueperse  87734129        63260                C   
3                 Albi  87615005        81000                B   
4       Albi Madeleine  87615195        81000                C   

   Total Voyageurs 2023  Total Voyageurs + Non voyageurs 2023  \
0                     0                                     0   
1                  4394                                  4394   
2                 86794                                 86794   
3                568827                                711034   
4                 89396                                 89396   

   Total Voyageurs 2022  Total Voyageurs + Non voyageurs 2022  \
0                     2                                     2   
1                  4046                                  4046   
2                

In [4]:
list(frequentations.columns)

['Nom de la gare',
 'Code UIC',
 'Code postal',
 'Segmentation DRG',
 'Total Voyageurs 2023',
 'Total Voyageurs + Non voyageurs 2023',
 'Total Voyageurs 2022',
 'Total Voyageurs + Non voyageurs 2022',
 'Total Voyageurs 2021',
 'Total Voyageurs + Non voyageurs 2021',
 'Total Voyageurs 2020',
 'Total Voyageurs + Non voyageurs 2020',
 'Total Voyageurs 2019',
 'Total Voyageurs + Non voyageurs 2019',
 'Total Voyageurs 2018',
 'Total Voyageurs + Non voyageurs 2018',
 'Total Voyageurs 2017',
 'Total Voyageurs + Non voyageurs 2017',
 'Total Voyageurs 2016',
 'Total Voyageurs + Non voyageurs 2016',
 'Total Voyageurs 2015',
 'Total Voyageurs + Non voyageurs 2015']

## Transformation de la donnée

On remarque que les données sont présentées de la façon suivante:

| Nom                 | Code Postal | Total Voyageurs 2023 | Total Voyageurs 2022 | Total Voyageurs 2021 |
|---------------------|-------------|----------------------|----------------------|----------------------|
| Aiguebelette-le-Lac | 73610       | 4394                 | 4046                 | 3658                 |
| Aigueperse          | 63260       | 86794                | 70599                | 55143                |
| ...                 | ...         | ...                  | ...                  | ...                  |

Ce format n'est pas idéal, car il multiplie le nombre de colonnes si on ajoute des années. Ainsi, il rend compliqué les comparaisons d'une année à une autre.

Il est donc préférable de procéder ainsi : 

| Nom                 | Code Postal | Année | Total Voyageurs      |
|---------------------|-------------|-------|----------------------|
| Aiguebelette-le-Lac | 73610       | 2023  | 4394                 |
| Aiguebelette-le-Lac | 73610       | 2022  | 4046                 |
| Aiguebelette-le-Lac | 73610       | 2020  | 3658                 |
| Aigueperse          | 63260       | 2023  | 86794                |
| Aigueperse          | 63260       | 2022  | 70599                |
| Aigueperse          | 63260       | 2020  | 55143                |
| ...                 | ...         | ...   | ...                  |

On ajoute alors un certain nombre de lignes, mais on rend la donnée beaucoup plus lisible.

In [18]:
def set_correct_columns(frequentations_df):
    years = [str(year) for year in range(2015, 2024)]
    transformed_df = pd.DataFrame()
    for year in years:
        year_df = frequentations_df[["Nom de la gare", "Code UIC", "Code postal", "Segmentation DRG", f"Total Voyageurs {year}", f"Total Voyageurs + Non voyageurs {year}"]]
        year_df.loc[:,"Année"] = year
        year_df = year_df.rename(columns={f"Total Voyageurs {year}":"Total Voyageurs", f"Total Voyageurs + Non voyageurs {year}":"Total Voyageurs + Non Voyageurs"})
        transformed_df = pd.concat([transformed_df, year_df])
    transformed_df = transformed_df.sort_values(by=["Nom de la gare", "Année"])
    transformed_df = transformed_df.reset_index(drop=True)
    return transformed_df
        
    
set_correct_columns(frequentations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df.loc[:,"Année"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df.loc[:,"Année"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df.loc[:,"Année"] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value 

Unnamed: 0,Nom de la gare,Code UIC,Code postal,Segmentation DRG,Total Voyageurs,Total Voyageurs + Non Voyageurs,Année
0,Abancourt,87313759,60220,C,39720,39720,2015
1,Abancourt,87313759,60220,C,41096,41096,2016
2,Abancourt,87313759,60220,C,43760,43760,2017
3,Abancourt,87313759,60220,C,40228,40228,2018
4,Abancourt,87313759,60220,C,42685,42685,2019
...,...,...,...,...,...,...,...
27085,Évry-Courcouronnes,87681387,91000,B,11258509,11258509,2019
27086,Évry-Courcouronnes,87681387,91000,B,5130572,5130572,2020
27087,Évry-Courcouronnes,87681387,91000,B,8307516,8307516,2021
27088,Évry-Courcouronnes,87681387,91000,B,9299239,9299239,2022
