In [311]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd

from shapely.geometry import Point

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.preprocessing import Normalizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score
from sklearn.inspection import permutation_importance
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.linear_model import LogisticRegression

#import xgboost as xgb
#from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier, plot_importance

import warnings
from warnings import simplefilter
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [3]:
class Config:
    PATH = '../../data/'

class Params:
    PARAM = 0

In [4]:
config = Config()
p = Params()

In [5]:
df = pd.read_csv(config.PATH+'E-tmja-2019-clean.csv')

In [6]:
columns_to_drop = ['dateReferentiel','zD','zF']
df = df.drop(columns=columns_to_drop)

In [7]:
import pyproj

def add_lat_lon_columns(df):
    """
    Adds new columns 'lonD', 'latD', 'lonF', 'latF' to the dataframe with
    corresponding latitude and longitude values based on the 'xD', 'yD', 'xF',
    and 'yF' columns, which are in Lambert-93 projection.

    Parameters:
    -----------
    df : pandas.DataFrame
        The dataframe containing the columns 'xD', 'yD', 'xF', and 'yF'.

    Returns:
    --------
    pandas.DataFrame
        The original dataframe with the new 'lonD', 'latD', 'lonF', and 'latF'
        columns added.
    """
    # Define the input and output projections
    in_proj = pyproj.Proj(init='epsg:2154')  # Lambert-93
    out_proj = pyproj.Proj(init='epsg:4326')  # WGS84

    df['xD'] = df['xD'].str.replace(',', '.')
    df['yD'] = df['yD'].str.replace(',', '.')
    df['xF'] = df['xF'].str.replace(',', '.')
    df['yF'] = df['yF'].str.replace(',', '.')

    # Convert start coordinates to lat-long
    df['lonD'], df['latD'] = pyproj.transform(in_proj, out_proj, df['xD'], df['yD'])

    # Convert end coordinates to lat-long
    df['lonF'], df['latF'] = pyproj.transform(in_proj, out_proj, df['xF'], df['yF'])

    return df

In [8]:
df = add_lat_lon_columns(df)



In [310]:
df.head()

Unnamed: 0,route,longueur,prD,depPrD,concessionPrD,absD,cumulD,xD,yD,prF,...,yF,anneeMesureTrafic,typeComptageTrafic,typeComptageTrafic_lib,TMJA,ratio_PL,lonD,latD,lonF,latF
0,31D0044,44,0,31,N,0,0,511656.78,6204078.36,0,...,6204063.93,,,,,,0.695459,42.911382,0.695975,42.911263
1,31D0044E,762,0,31,N,0,0,511367.22,6204210.06,0,...,6203732.4,,,,,,0.691871,42.91249,0.698322,42.908331
2,69D0301,6055,2,69,N,-981,0,844036.84,6510806.45,7,...,6509342.34,,,,,,4.850575,45.681629,4.918315,45.667307
3,69D0383,13752,4,69,N,-494,0,846831.3,6522372.16,17,...,6512429.8,,,,,,4.89002,45.785151,4.844653,45.696348
4,69D0383BPNL,2408,0,69,N,0,0,844982.46,6522772.27,2,...,6522372.16,,,,,,4.866352,45.789149,4.89002,45.785151


### Clean data

In [14]:
missing_values = df.isna().sum().sort_values(ascending=False)
missing_values[:6]

ratio_PL                  1634
TMJA                       954
typeComptageTrafic_lib     417
typeComptageTrafic         417
anneeMesureTrafic          138
route                        0
dtype: int64

In [11]:
import missingno as msno

# keep only columns with nan values
columns_with_nan = missing_values[missing_values.values!=0].index
msno.matrix(df[columns_with_nan])

<AxesSubplot:>

### Données stations


In [16]:
df_stations = pd.read_csv(config.PATH+'I-Données de stations TE_DV.xlsx - export_data_te.csv')

In [18]:
df_stations.head()

Unnamed: 0,URL,Station de service,Ville,Adresse,Coordinates,Energies,Lavage,Paiement,Produits & Services,Services,Club TotalEnergies,Truck,H2 Conversion
0,https://store.totalenergies.fr/fr_FR/NF059116,RELAIS SERIGUETTE,AIMARGUES,"2 bis. ROUTE DE LA PETITE CAMARGUE , 30470 AI...","43.6961378,4.2007447","AdBlue Truck, Excellium 98, Excellium Diesel, ...",,"Routex, UTA, DKV, Mobility de TotalEnergies, M...","AdBlue Truck,","Café, Justbip, Wifi, AdBlue bidon,","Assistance Club 10 Jours, Jauge Cadeau, Cagnot...","Truck, Truck store,",0.0
1,https://store.totalenergies.fr/fr_FR/NF080036,RELAIS DEVEZE,BEZIERS,"1 AVENUE DE LA DEVEZE , 34500 BEZIERS, FRANCE","43.33150089999999,3.2474023","AdBlue Truck, Super Ethanol e85, Excellium 98,...","Lavage Haute Pression, Aspirateur, Lavage à Ro...","UTA, Carte Jubileo, MOL, Carte Wash, American ...","AdBlue Truck,","FDJ, Café, Lavage à Rouleaux, Justbip, Gonflag...","Assistance Club 10 Jours, Jauge Cadeau, Cagnot...","Truck, Truck store,",0.0
2,https://store.totalenergies.fr/fr_FR/NF080198,RELAIS SAINT MATHURIN,ALLONNE,"ZAC SAINT MATHURIN - RD 1001 , 60000 ALLONNE,...","49.409445,2.12815","Borne DC175 kW, AdBlue Truck, Excellium 98, Ex...","Aspirateur, Lavage à Rouleaux,","Visa, American Express, Mastercard, Eurotrafic...","AdBlue Truck,","Café, Lavage à Rouleaux, Gonflage, Wifi, Justb...","Assistance Club 10 Jours, Jauge Cadeau, Cagnot...","Truck, Pompes PL,",1.0
3,https://store.totalenergies.fr/fr_FR/NF080197,REL.LORRAINE LES RAPPES,SANDAUCOURT,"AUT.A31-LORRAINE LES RAPPES , 88170 SANDAUCOU...","48.2659392,5.863884","Borne AC43/DC50 kW, Borne DC175 kW, SP95 e10, ...",,"Mobility de TotalEnergies, Carte Jubileo, Cart...","AdBlue Truck,","Défibrillateur, Café, Wifi, Autoroutière, Toil...","Assistance Club 30 Jours, Club Truck, Cagnotte...","Truck, Pompes PL,",0.0
4,https://store.totalenergies.fr/fr_FR/NF058916,REL.MORAINVILLIERS NORD,MORAINVILLIERS,"A13 - AIRE DE MORAINVILLIERS NORD , 78630 MOR...","48.9411429,1.9543387","Diesel, GPL, SP95 e10, Excellium 98, GNR, Exce...",,"Mobility de TotalEnergies, MOL, American Expre...","AdBlue Truck,","Click & Collect Lyf, Toilettes, Défibrillateur...","Assistance Club 30 Jours, Club Truck, Cagnotte...","Truck, Parking PL, Truck store,",0.0


In [332]:
def create_coordinate_columns(
        df_stations: pd.DataFrame
):
    #create Latitude and Longitude coordinates 
    df_stations['Latitude'] = df_stations['Coordinates'].apply(lambda x: x.split(',')[0].strip())
    df_stations['Longitude'] = df_stations['Coordinates'].apply(lambda x: x.split(',')[1].strip())

    #filter out the empty coordinates
    df_stations = df_stations[(df_stations['Longitude'] != '')&(df_stations['Latitude'] != '')]

    return df_stations

In [None]:
df_stations = create_coordinate_columns(df_stations)

### Geopandas


In [327]:
df_shp = gpd.read_file(config.PATH+'E-tmja2019-shp/TMJA2019.shp')

In [328]:
df_shp = df_shp.set_index(['route'])

In [329]:
def transform_geometry(
        df: pd.DataFrame
        )-> pd.DataFrame:
    """
    Transform the geometry column Linestrings currently in Lambert-93 projection
    into the corresponding latitude and longitude.

    Parameters:
    -----------
    df : pandas.DataFrame
        The dataframe containing the geopandas geometry column

    Returns:
    --------
    pandas.DataFrame
        The original dataframe with the new geometry column.
    """
    # Define the input and output projections
    in_proj = pyproj.Proj(init='epsg:2154')  # Lambert-93
    out_proj = pyproj.Proj(init='epsg:4326')  # WGS84
    
    df['Coordinate_transform'] = df['Coordinates'].apply(lambda x: pyproj.transform(out_proj, in_proj, float(x.split(',')[0]),float(x.split(',')[1])))
    

    return df

In [335]:
def intersection_road_station(
        df_line: pd.DataFrame, 
        station_coordinates: str, 
        route_index: int,
        ) -> str:
    line = df_line['geometry'].iloc[route_index]

    point = Point(
        round(station_coordinates[0]),
        round(station_coordinates[1]))

    # Check if the station point is on the route LineString
    #is_on_line = line.intersects(point)
    EPS = 1
    is_on_line = line.buffer(EPS).contains(point)
    
    if is_on_line:
        return df_line.index[route_index]
    else:
        return ''


In [331]:
# select only routes with LineString geometries
df_line = df_shp[df_shp['geometry'].geom_type == 'LineString']

# transform the station coordinates in the correct form 
df_stations_t = transform_geometry(df_stations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [306]:
#test function for one station
station_coord = df_stations_t['Coordinate_transform'][0]
results = [intersection_road_station(df_line,station_coord,route_id) for route_id in range(df_line.shape[0])]

In [342]:
def create_list_routes(
        station_coord: tuple,
        df_line: pd.DataFrame
):
        list_routes =  [intersection_road_station(df_line,station_coord,route) for route in range(df_line.shape[0])]

        return list_routes

In [343]:
def create_col_routes(
        df_stations_t: pd.DataFrame,
        df_line: pd.DataFrame
):
    # Crate the column Routes_on with a list of the route the station is on
    df_stations_t['Routes_on'] = df_stations_t['Coordinate_transform'].apply(lambda station_coord: create_list_routes(station_coord,df_line))
    # Join the list to check if there are any non-empty routes
    df_stations_t['Count_routes_on'] = df_stations_t['Routes_on'].apply(lambda my_list: ''.join(my_list))

    return df_stations_t

In [344]:
df_stations_t = create_col_routes(df_stations_t, df_line)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [346]:
df_stations_t[df_stations_t['Count_routes_on']!='']

Unnamed: 0,URL,Station de service,Ville,Adresse,Coordinates,Energies,Lavage,Paiement,Produits & Services,Services,Club TotalEnergies,Truck,H2 Conversion,Latitude,Longitude,Routes_on,Count_routes_on,Coordinate_transform
