### Importations des packages

In [1]:
import requests
import pandas as pd
import numpy as np 
import s3fs
from scipy.stats import zscore
import matplotlib.pyplot as plt
from tqdm import tqdm
from helpers import *
# Render floats with two decimals in every pandas display produced below.
pd.set_option('display.float_format', '{:.2f}'.format)


### Data extraction from multiple sources: SNCF API, GOUV API, and regional economic data from web CSV files

First, use the well-established SNCF API to access station data.

In [2]:
# Station reference table from the "gares-de-voyageurs" dataset: station name,
# geographic position, INSEE commune code and UIC code (aliased to `uic`).
# NOTE(review): `get_names_geo_data_from_sncf_api` comes from `helpers` (star
# import) and is not shown here; its paging/flattening behavior should be
# confirmed there.
station_geo_data = get_names_geo_data_from_sncf_api(
    endpoint_suffix="gares-de-voyageurs",
    select="""nom as nom_gare,
                           position_geographique,
                           codeinsee,
                           codes_uic as uic""",
)

# 2022 passenger counts from the "frequentation-gares" dataset;
# `code_uic_complet` is aliased to `uic` so both tables expose the same key name.
station_freq_data = get_names_geo_data_from_sncf_api(
    endpoint_suffix="frequentation-gares",
    select="""total_voyageurs_2022,
                           
                           nom_gare,
                           code_uic_complet as uic """,
)

nb of stations downloaded: 2881, from table gares-de-voyageurs
nb of stations downloaded: 3010, from table frequentation-gares


In [3]:
# Rename the `position_geographique.lon` / `.lat` columns to station-specific
# names, then discard the original `position_geographique` column.
station_geo_data = (
    station_geo_data
    .rename(
        columns={
            "position_geographique.lon": "lon_gare",
            "position_geographique.lat": "lat_gare",
        }
    )
    .drop(columns="position_geographique")
)

In [4]:
# Attach the geographic attributes to the frequentation table (left join on
# `uic`; the geo table's own `nom_gare` is dropped to avoid a duplicate column),
# then keep only stations with a strictly positive 2022 passenger count.
merged = (
    station_freq_data
    .merge(station_geo_data.drop(columns="nom_gare"), how="left", on="uic")
    .loc[lambda stations: stations["total_voyageurs_2022"] > 0]
)

### Importations des données communales 

In [5]:
# Open the S3 client used for every read/write below
# (`s3_connection` comes from `helpers` and is not shown here).
s3=s3_connection()

connection successful


On importe les données communales récupérées sur le site "https://www.unehistoireduconflitpolitique.fr/telecharger.html". Nous disposons de fichiers de données sur la population et le revenu par habitant, sur le taux de personnes diplômées par commune, ainsi que sur les catégories socio-professionnelles.

1. Revenus par habitant et population par commune

In [6]:
# Commune-level population and mean income (2022 columns only).
# `codecommune` is read as text: codes such as 01001 start with a zero.
path_revcommunes = "clichere/diffusion/revcommunes.csv"
columns_to_select = ["codecommune", "nomcommune", "pop2022", "revmoy2022"]
dtype_spec = dict(codecommune='str')

revcommunes = s3.read_csv_from_s3(
    path_revcommunes,
    columns_to_select,
    dtype_spec,
)

# First rows, then summary statistics, one after the other.
print(revcommunes.head(), revcommunes.describe(), sep="\n")

  codecommune             nomcommune  pop2022  revmoy2022
0       01001  ABERGEMENT-CLEMENCIAT      747    21615.65
1       01002    ABERGEMENT-DE-VAREY      288    16945.77
2       01004      AMBERIEU-EN-BUGEY    14375    18423.11
3       01005    AMBERIEUX-EN-DOMBES     1717    20996.12
4       01006                AMBLEON      116    18075.99
         pop2022  revmoy2022
count   36661.00    34746.00
mean     1925.01    17051.48
std     14952.15     5197.38
min         0.00     3350.48
25%       182.00    14012.09
50%       429.00    16322.69
75%      1118.00    19126.26
max   2118266.00   126903.91


2. Taux de diplôme par commune

In [7]:
# Commune-level diploma shares (`pbac2022`, `psup2022`).
# `codecommune` is read as text: codes such as 01001 start with a zero.
path_dipcommunes = "clichere/diffusion/diplomescommunes.csv"
columns_to_select = ["codecommune", "pbac2022", "psup2022"]
dtype_spec = dict(codecommune='str')

dipcommunes = s3.read_csv_from_s3(
    path_dipcommunes,
    columns_to_select,
    dtype_spec,
)

# First rows, then summary statistics, one after the other.
print(dipcommunes.head(), dipcommunes.describe(), sep="\n")

  codecommune  pbac2022  psup2022
0       01001      0.45      0.23
1       01002      0.67      0.09
2       01003       NaN       NaN
3       01004      0.46      0.29
4       01005      0.34      0.17
       pbac2022  psup2022
count  35222.00  35222.00
mean       0.46      0.27
std        0.20      0.17
min        0.00      0.00
25%        0.33      0.16
50%        0.45      0.26
75%        0.58      0.37
max        1.00      1.00


3. Catégories socio-professionnelles

In [8]:
# Commune-level shares of each socio-professional category (2022 columns only).
# `codecommune` is read as text: codes such as 01001 start with a zero.
path_cspcommunes = "clichere/diffusion/cspcommunes.csv"
columns_to_select = [
    "codecommune",
    "pagri2022",
    "pempl2022",
    "pcadr2022",
    "pouvr2022",
    "pchom2022",
    "pindp2022",
]
dtype_spec = dict(codecommune='str')

cspcommunes = s3.read_csv_from_s3(
    path_cspcommunes,
    columns_to_select,
    dtype_spec,
)

# First rows, then summary statistics, one after the other.
print(cspcommunes.head(), cspcommunes.describe(), sep="\n")

  codecommune  pagri2022  pindp2022  pcadr2022  pempl2022  pouvr2022  \
0       01001       0.00       0.12       0.10       0.27       0.46   
1       01002       0.00       0.34       0.08       0.58       0.00   
2       01003        NaN        NaN        NaN        NaN        NaN   
3       01004       0.00       0.03       0.14       0.25       0.25   
4       01005       0.02       0.06       0.09       0.28       0.27   

   pchom2022  
0       0.08  
1       0.08  
2        NaN  
3       0.09  
4       0.03  
       pagri2022  pindp2022  pcadr2022  pempl2022  pouvr2022  pchom2022
count   35065.00   35065.00   35065.00   35065.00   35065.00   35065.00
mean        0.06       0.09       0.12       0.26       0.23       0.07
std         0.13       0.12       0.13       0.18       0.19       0.11
min         0.00       0.00       0.00       0.00       0.00       0.00
25%         0.00       0.00       0.00       0.13       0.08       0.00
50%         0.00       0.05       0.08       

4. Fusion et export des fichiers de données

In [9]:
# Combine the three commune tables on `codecommune`. Both joins are inner
# joins, so only communes present in every table are kept.
intermediaire_comm = revcommunes.merge(dipcommunes, how="inner", on="codecommune")
final_comm = intermediaire_comm.merge(cspcommunes, how="inner", on="codecommune")

print(final_comm)

      codecommune             nomcommune  pop2022  revmoy2022  pbac2022  \
0           01001  ABERGEMENT-CLEMENCIAT      747    21615.65      0.45   
1           01002    ABERGEMENT-DE-VAREY      288    16945.77      0.67   
2           01004      AMBERIEU-EN-BUGEY    14375    18423.11      0.46   
3           01005    AMBERIEUX-EN-DOMBES     1717    20996.12      0.34   
4           01006                AMBLEON      116    18075.99      1.00   
...           ...                    ...      ...         ...       ...   
36627       95676     VILLERS-EN-ARTHIES      529    20655.25      0.53   
36628       95678          VILLIERS-ADAM      927    26113.93      0.89   
36629       95680        VILLIERS-LE-BEL    27033    11837.68      0.44   
36630       95682        VILLIERS-LE-SEC      248    18516.31      0.27   
36631       95690    WY-DIT-JOLY-VILLAGE      375    25304.95      0.87   

       psup2022  pagri2022  pindp2022  pcadr2022  pempl2022  pouvr2022  \
0          0.23       0.0

In [17]:
# Destination keys on S3 for the merged commune table and its three inputs.
output_path1 = 'clichere/diffusion/final_comm.parquet'
output_path2 = 'clichere/diffusion/revcommunes.parquet'
output_path3 = 'clichere/diffusion/dipcommunes.parquet'
output_path4 = 'clichere/diffusion/cspcommunes.parquet'

# Write each frame to its destination, in the same order as listed.
for frame, destination in (
    (final_comm, output_path1),
    (revcommunes, output_path2),
    (dipcommunes, output_path3),
    (cspcommunes, output_path4),
):
    s3.from_pandas_to_parquet_store_in_s3(frame, destination)


### Récupération des données communales

**TODO:** insert the commune-processing code here and remove the extraction from S3.

Load the commune-level statistics (stored in the variable `regional_stat`).

In [10]:
# Load the merged commune table exported to S3 as parquet.
# Despite its name, `regional_stat` holds one row per commune.
regional_stat=s3.get_tables_from_s3("clichere/diffusion/final_comm.parquet")

In [11]:
# Display the loaded commune table (pandas truncates the middle rows).
regional_stat

Unnamed: 0,codecommune,nomcommune,pop2022,revmoy2022,pbac2022,psup2022,pagri2022,pindp2022,pcadr2022,pempl2022,pouvr2022,pchom2022
0,01001,ABERGEMENT-CLEMENCIAT,747,21615.65,0.45,0.23,0.00,0.12,0.10,0.27,0.46,0.08
1,01002,ABERGEMENT-DE-VAREY,288,16945.77,0.67,0.09,0.00,0.34,0.08,0.58,0.00,0.08
2,01004,AMBERIEU-EN-BUGEY,14375,18423.11,0.46,0.29,0.00,0.03,0.14,0.25,0.25,0.09
3,01005,AMBERIEUX-EN-DOMBES,1717,20996.12,0.34,0.17,0.02,0.06,0.09,0.28,0.27,0.03
4,01006,AMBLEON,116,18075.99,1.00,0.01,0.00,0.00,0.00,0.29,0.29,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...
36627,95676,VILLERS-EN-ARTHIES,529,20655.25,0.53,0.51,0.00,0.19,0.21,0.25,0.01,0.10
36628,95678,VILLIERS-ADAM,927,26113.93,0.89,0.67,0.00,0.07,0.45,0.13,0.09,0.05
36629,95680,VILLIERS-LE-BEL,27033,11837.68,0.44,0.23,0.00,0.07,0.05,0.38,0.28,0.13
36630,95682,VILLIERS-LE-SEC,248,18516.31,0.27,0.09,0.00,0.09,0.25,0.00,0.03,0.04


In [12]:
# Keep only communes that have inhabitants, a positive mean income and a
# known baccalauréat share.
regional_stat = regional_stat.loc[
    lambda communes: communes["pop2022"].gt(0)
    & communes["revmoy2022"].gt(0)
    & communes["pbac2022"].notna()
]

In [13]:
# Left-join the commune statistics onto the stations: the station's INSEE
# code (`codeinsee`) is matched against the commune code (`codecommune`).
fusion = pd.merge(
    merged,
    regional_stat,
    how="left",
    left_on="codeinsee",
    right_on="codecommune",
)

In [14]:
# Spot-check two random rows of the joined table (no seed: rows differ on each run).
fusion.sample(2)

Unnamed: 0,total_voyageurs_2022,nom_gare,uic,codeinsee,lon_gare,lat_gare,codecommune,nomcommune,pop2022,revmoy2022,pbac2022,psup2022,pagri2022,pindp2022,pcadr2022,pempl2022,pouvr2022,pchom2022
2377,605331,Belfort - Montbéliard TGV,87300822,90068,6.9,47.59,90068,MEROUX,886.0,31680.6,0.62,0.41,0.0,0.14,0.15,0.15,0.38,0.03
192,3356330,Herblay,87381889,95306,2.16,48.99,95306,HERBLAY,32231.0,20781.98,0.57,0.41,0.0,0.06,0.28,0.23,0.14,0.05


In [15]:
# Keep only stations for which the left join found commune statistics
# (rows without a `pop2022` value are discarded).
final = fusion.dropna(subset=["pop2022"])

In [16]:
# Build every ordered pair of stations (cross join of the coordinate table
# with itself); the `_x` columns describe the origin, `_y` the other station.
station_coords = final.loc[:, ["uic", "lon_gare", "lat_gare"]]
cross_table_for_distance_calculation = pd.merge(
    station_coords, station_coords, how="cross"
)

lat1 = cross_table_for_distance_calculation["lat_gare_x"]
lon1 = cross_table_for_distance_calculation["lon_gare_x"]
lat2 = cross_table_for_distance_calculation["lat_gare_y"]
lon2 = cross_table_for_distance_calculation["lon_gare_y"]

# At this stage the column holds the distance for EVERY pair; it only becomes
# the distance to the closest station once the minimum per origin is taken.
# (`haversine_vectorized` comes from `helpers` and is not shown here.)
cross_table_for_distance_calculation["dist_closest_station_km"] = (
    haversine_vectorized(lat1, lon1, lat2, lon2)
)

In [17]:
# Discard the pairs that match a station with itself, so a station can never
# be its own closest neighbour.
cross_table_for_distance_calculation = cross_table_for_distance_calculation.loc[
    lambda pairs: pairs["uic_x"] != pairs["uic_y"]
]

In [18]:
# For each origin station, find the row label of its smallest pair distance...
distances_by_origin = cross_table_for_distance_calculation.groupby("uic_x")[
    "dist_closest_station_km"
]
idx = distances_by_origin.idxmin()

# ...and keep exactly those rows: one closest-neighbour row per origin station.
result = (
    cross_table_for_distance_calculation
    .loc[idx]
    .reset_index(drop=True)
)

In [19]:
# Attach the closest-station distance to each station: the origin key `uic_x`
# is renamed to `uic` so it matches the key of `final`.
closest_distance = result[["uic_x", "dist_closest_station_km"]].rename(
    columns={"uic_x": "uic"}
)
final = pd.merge(final, closest_distance, how="inner", on="uic")

In [20]:

# Number of stations located in each commune.
# Communes are identified by their INSEE code (`codecommune`), not by
# `nomcommune`: commune names are not unique across France, so counting and
# merging by name lumps together stations of unrelated communes that happen
# to share a name.
new = (
    final.groupby("codecommune")
    .size()
    .rename("nb_stations_same_commune")
    .reset_index()
)

# Drop any previous count first so that re-running this cell does not create
# `_x` / `_y` duplicates of the column, then attach the per-commune count.
final = final.drop(columns="nb_stations_same_commune", errors="ignore").merge(
    new, on="codecommune", how="inner"
)

In [22]:
# Look up the region of each station (`gouv_api_addresses` comes from
# `helpers` and is not shown here).
# NOTE(review): presumably returns one value per row of `final`, in row
# order — confirm against the helper.
french_regions=gouv_api_addresses(final)

In [23]:
# Store the looked-up regions as a new column.
# NOTE(review): if `french_regions` is a pandas Series, the assignment aligns
# on index rather than position — confirm it matches `final`'s index.
final["regions"]=french_regions

In [24]:
# Sanity check: list the distinct region names that were assigned.
final["regions"].unique()

array(['Pays de la Loire', 'Île-de-France', 'Hauts-de-France',
       "Provence-Alpes-Côte d'Azur", 'Occitanie', 'Auvergne-Rhône-Alpes',
       'Bourgogne-Franche-Comté', 'Centre-Val de Loire', 'Normandie',
       'Nouvelle-Aquitaine', 'Grand Est', 'Bretagne'], dtype=object)

In [25]:
# Sanity check: count missing values per column of the final table.
final.isna().sum()

total_voyageurs_2022        0
nom_gare                    0
uic                         0
codeinsee                   0
lon_gare                    0
lat_gare                    0
codecommune                 0
nomcommune                  0
pop2022                     0
revmoy2022                  0
pbac2022                    0
psup2022                    0
pagri2022                   0
pindp2022                   0
pcadr2022                   0
pempl2022                   0
pouvr2022                   0
pchom2022                   0
dist_closest_station_km     0
nb_stations_same_commune    0
regions                     0
dtype: int64

In [26]:
# Spot-check two random rows of the final table (no seed: rows differ on each run).
final.sample(2)

Unnamed: 0,total_voyageurs_2022,nom_gare,uic,codeinsee,lon_gare,lat_gare,codecommune,nomcommune,pop2022,revmoy2022,...,psup2022,pagri2022,pindp2022,pcadr2022,pempl2022,pouvr2022,pchom2022,dist_closest_station_km,nb_stations_same_commune,regions
1862,3183,Ennevelin,87287276,59197,3.16,50.55,59197,ENNEVELIN,2296.0,27796.37,...,0.5,0.05,0.06,0.13,0.2,0.0,0.05,1.35,1,Hauts-de-France
377,35454,Pont-de-Buis,87474056,29302,-4.09,48.25,29302,PONT-DE-BUIS-LES-QUIMERC,3755.0,15216.62,...,0.28,0.01,0.05,0.05,0.2,0.37,0.04,5.9,1,Bretagne


In [27]:
# Export the final station table to S3 as parquet.
# NOTE(review): the recorded output of this cell is "PermissionError: Access
# Denied", and this path uses a different bucket prefix ("aayrapetyan") than
# the other exports in this notebook ("clichere") — confirm the intended,
# writable destination.
s3.from_pandas_to_parquet_store_in_s3(final,"aayrapetyan/diffusion/final_table.parquet")

PermissionError: Access Denied.

In [28]:
# Reload the final table from S3; this replaces the in-memory `final` built
# in the cells above with whatever is stored at that path.
# NOTE(review): the recorded write to this same path failed with
# "PermissionError: Access Denied", so the table loaded here presumably comes
# from an earlier upload — confirm it matches the current pipeline.
final=s3.get_tables_from_s3("aayrapetyan/diffusion/final_table.parquet")
# Display the reloaded table.
final

Unnamed: 0,total_voyageurs_2022,nom_gare,uic,codeinsee,lon_gare,lat_gare,codecommune,nomcommune,pop2022,revmoy2022,...,psup2022,pagri2022,pindp2022,pcadr2022,pempl2022,pouvr2022,pchom2022,dist_closest_station_km,nb_stations_same_commune,regions
0,40825,Abbaretz,87481614,44001,-1.52,47.55,44001,ABBARETZ,2257.00,12567.25,...,0.17,0.06,0.02,0.05,0.31,0.42,0.04,9.75,1,Pays de la Loire
1,177092,Achères Grand Cormier,87386052,78551,2.09,48.96,78551,SAINT-GERMAIN-EN-LAYE,39172.00,41601.99,...,0.64,0.00,0.04,0.39,0.23,0.07,0.06,1.97,1,Île-de-France
2,80648,Achiet-le-Grand,87342048,62005,2.78,50.13,62005,ACHIET-LE-GRAND,888.00,15813.03,...,0.34,0.00,0.00,0.11,0.16,0.38,0.08,4.22,1,Hauts-de-France
3,32800,Agay,87757559,83118,6.86,43.43,83118,SAINT-RAPHAEL,37114.00,26637.97,...,0.34,0.00,0.13,0.13,0.35,0.13,0.10,1.80,6,Provence-Alpes-Côte d'Azur
4,11325,Aigues-Mortes,87775858,30003,4.19,43.57,30003,AIGUES-MORTES,8076.00,20798.90,...,0.20,0.01,0.14,0.08,0.39,0.17,0.11,5.56,1,Occitanie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2803,176599,Weyersheim,87213678,67529,7.80,48.72,67529,WEYERSHEIM,3365.00,21737.75,...,0.26,0.01,0.15,0.23,0.22,0.07,0.03,2.10,1,Grand Est
2804,30849,Willer-sur-Thur,87182584,68372,7.07,47.84,68372,WILLER-SUR-THUR,1735.00,18620.66,...,0.13,0.00,0.09,0.09,0.24,0.34,0.04,1.82,1,Grand Est
2805,37020,Wimille - Wimereux,87317123,62894,1.61,50.76,62894,WIMILLE,4062.00,19524.12,...,0.28,0.01,0.07,0.15,0.21,0.30,0.06,4.08,1,Hauts-de-France
2806,38618,Ygos-Saint-Saturnin,87671487,40333,-0.74,43.98,40333,YGOS-SAINT-SATURNIN,1490.00,15004.68,...,0.28,0.00,0.08,0.11,0.33,0.32,0.09,6.68,1,Nouvelle-Aquitaine
