In [51]:
from __future__ import annotations
import pandas as pd
import numpy as np
import os
import sys
sys.path.append("../src/")

from utils import *
from sklearn.impute import KNNImputer
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

import geopandas as gpd

In [None]:
# Call a helper from utils that opens the accidents dataset and formats it
df_france = create_dataframe_france()

In [None]:
# Preview the first rows of the accidents dataframe
df_france.head()

In [None]:
# Read the infrastructure GeoJSON file with geopandas
df_amenagement = gpd.read_file("../data/france-20230101.geojson")

In [None]:
# Peek at the rows whose "ame_g" column equals "AUCUN"
df_amenagement.loc[df_amenagement["ame_g"]=="AUCUN"].head()

In [None]:
# Run preprocessing() on the infrastructure frame
# (presumably provided by `from utils import *` — confirm; it is not defined in this notebook)
df_amenagement = preprocessing(df_amenagement)

In [None]:
# Keep only the rows whose commune code starts with "75" (the Paris department prefix).
# Cast to pandas' string dtype first so the .str accessor works whatever the raw dtype is.
df_amenagement["code_com"] = df_amenagement["code_com"].astype("string")
# .copy() detaches the filtered frame from the full dataset: later cells modify it
# in place (reset_index, new columns), which must not target a slice.
df_amenagement = df_amenagement.loc[df_amenagement["code_com"].str[:2] == "75"].copy()
df_amenagement

In [None]:
# Move the current index into a regular column (named "index" when the index is
# unnamed) and renumber the rows from 0; link_dataframes below drops an "index" column.
# NOTE(review): in-place and not idempotent — re-running this cell on the same frame
# tries to insert the index column again.
df_amenagement.reset_index(inplace = True)
df_amenagement

In [None]:
# NOTE(review): df_paris is not assigned in any cell visible in this notebook — it
# relies on earlier kernel state; define it explicitly before this cell.
np.array(df_paris.index)

In [None]:
# Parse the "date_maj" column into pandas datetimes
df_amenagement["date_maj"] = pd.to_datetime(df_amenagement["date_maj"])

In [None]:
# Pull the ('latitude_dep', 'longitude_dep') and ('lat', 'long') column pairs out as NumPy arrays.
# NOTE(review): neither array is referenced by the cells visible below — link_dataframes
# is called with the dataframes themselves.
infrastructure = df_amenagement[['latitude_dep', 'longitude_dep']].to_numpy()
accidents = df_paris[['lat', 'long']].to_numpy()

In [None]:
import numpy as np
from math import sin, cos, sqrt, atan2, radians, asin
from tqdm import tqdm
import warnings

# Silence all warnings from here on.
# NOTE(review): this also hides pandas' SettingWithCopyWarning and deprecation notices.
warnings.simplefilter('ignore')

def _haversine_km(lat_a, lon_a, lat_b, lon_b, radius_km=6371):
    """Haversine great-circle distance in km between points given in degrees.

    Accepts scalars, sequences, NumPy arrays or pandas Series; the usual
    broadcasting rules apply.
    """
    lat_a, lon_a, lat_b, lon_b = map(np.radians, (lat_a, lon_a, lat_b, lon_b))
    a = (np.sin((lat_a - lat_b) / 2) ** 2
         + np.cos(lat_a) * np.cos(lat_b) * np.sin((lon_a - lon_b) / 2) ** 2)
    # Rounding can push `a` marginally above 1 for near-antipodal points,
    # which would make sqrt(1 - a) NaN.
    a = np.minimum(a, 1.0)
    return radius_km * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))


def distance(lat_acc, lon_acc, lat_fin, lon_fin, lat_dep, lon_dep):
    """Distance in km from an accident to the nearer end point of each segment.

    Parameters
    ----------
    lat_acc, lon_acc : accident position, in degrees.
    lat_fin, lon_fin : segment end points, in degrees (scalar or array-like).
    lat_dep, lon_dep : segment start points, in degrees (scalar or array-like).

    Returns
    -------
    For every segment, the smaller of the haversine distances to its start
    and to its end point, with the same shape/type as the segment inputs.
    A missing (NaN) end point is ignored in favour of the other one.

    The previous implementation returned either all start-point distances or
    all end-point distances, depending on which array contained the global
    minimum, so most per-segment values were not the nearest-end distance.
    """
    dist_to_start = _haversine_km(lat_acc, lon_acc, lat_dep, lon_dep)
    dist_to_end = _haversine_km(lat_acc, lon_acc, lat_fin, lon_fin)
    # fmin (unlike minimum) skips NaN when only one of the two values is missing.
    return np.fmin(dist_to_start, dist_to_end)

def link_dataframes(accidents, infrastructure, radius):
    """Attach to every accident its nearest infrastructure segment.

    Parameters
    ----------
    accidents : DataFrame with 'lat' and 'long' columns (degrees). Modified in
        place: gains 'distance' and 'Index_in_accident', loses 'index' if present.
    infrastructure : DataFrame with 'latitude_dep', 'longitude_dep',
        'latitude_fin', 'longitude_fin' columns. Modified in place: loses
        'index' and 'distance' if present.
    radius : maximum search distance, compared against the value returned by
        `distance()` (kilometres).

    Returns
    -------
    The same `accidents` object. 'distance' is the distance to the nearest
    segment multiplied by 1000 (metres); 'Index_in_accident' is the *index
    label* of that segment in `infrastructure`. Both stay NaN for accidents
    with no segment within `radius`.
    """
    # Columns filled in by the loop and later used for the join
    accidents['distance'] = np.nan
    accidents['Index_in_accident'] = np.nan
    for i, row in tqdm(accidents.iterrows(), total=accidents.shape[0]):
        # Kept in a local Series rather than a scratch column on the caller's frame
        distances_km = distance(row['lat'], row['long'],
                                infrastructure['latitude_fin'],
                                infrastructure['longitude_fin'],
                                infrastructure['latitude_dep'],
                                infrastructure['longitude_dep'])
        within_radius = distances_km[distances_km <= radius]
        if within_radius.empty:
            # Nothing close enough: leave the NaN placeholders instead of raising
            continue
        accidents.at[i, 'distance'] = within_radius.min() * 1000
        # idxmin() returns the infrastructure index label; argmin() would return
        # a position inside the filtered selection, which is not a valid join key.
        accidents.at[i, 'Index_in_accident'] = within_radius.idxmin()

    infrastructure.drop(columns=["index", "distance"], inplace=True, errors="ignore")
    accidents.drop(columns=["index"], inplace=True, errors="ignore")

    return accidents

# Link the accidents dataframe (df_paris) to the infrastructure dataframe (df_amenagement).
# NOTE(review): link_dataframes compares `radius` with distances computed from R = 6371
# (km), so 500 means 500 km here, while the stored 'distance' column is multiplied by
# 1000 — confirm the intended unit.
infrastructure_with_acc = link_dataframes(df_paris, df_amenagement, 500)

In [None]:
# Display the accidents frame returned by link_dataframes
infrastructure_with_acc

In [None]:
# Summary statistics of 'distance' for the accidents whose value is below 50
infrastructure_with_acc.loc[infrastructure_with_acc['distance'] < 50, 'distance'].describe()

In [None]:
# Cast the match column to pandas' nullable Int32 dtype (link_dataframes initialises it with NaN)
infrastructure_with_acc['Index_in_accident'] = infrastructure_with_acc['Index_in_accident'].astype("Int32")

In [None]:
# Expose df_amenagement's index as an 'Index_in_accident' column, the key used by the merge below
df_amenagement['Index_in_accident'] = df_amenagement.index
df_amenagement

In [None]:
# Join every accident with its matched infrastructure row on 'Index_in_accident'
# (merge's default join type applies, since `how` is not given)
infrastructure_with_acc = infrastructure_with_acc.merge(df_amenagement, left_on="Index_in_accident", right_on="Index_in_accident")


In [None]:
# Display the merged accidents + infrastructure frame
infrastructure_with_acc

In [None]:
# Keep only the accidents whose 'distance' is below 50
# (link_dataframes stores the km value multiplied by 1000)
infrastructure_with_acc = infrastructure_with_acc.loc[infrastructure_with_acc['distance']<50]

In [None]:
import folium

def make_map(df, output_path="accident_with_infra.html", center=(48.8, 2.3), zoom_start=12):
    """Draw accidents and their linked infrastructure segments on a folium map and save it.

    Parameters
    ----------
    df : DataFrame with one row per accident and the columns 'grav', 'lat',
        'long', 'latitude_dep', 'longitude_dep', 'latitude_fin', 'longitude_fin'.
    output_path : HTML file the map is written to.
    center : (latitude, longitude) the map is initially centred on.
    zoom_start : initial zoom level.

    For each row the map gets a red triangle at the accident position (radius
    proportional to 'grav'), a blue circle at each end of the segment and a
    green line joining the two ends. Nothing is returned.
    """
    m = folium.Map(location=list(center), zoom_start=zoom_start)

    for _, row in df.iterrows():
        segment_start = [row['latitude_dep'], row['longitude_dep']]
        segment_end = [row['latitude_fin'], row['longitude_fin']]

        # Accident marker, sized by the gravity of the accident
        folium.RegularPolygonMarker(location=[row['lat'], row['long']],
                                    number_of_sides=3,
                                    radius=row['grav'] * 10,
                                    color='red',
                                    fill_color='red').add_to(m)

        # One marker per end of the infrastructure segment
        for endpoint in (segment_start, segment_end):
            folium.CircleMarker(location=endpoint,
                                radius=5,
                                color='blue',
                                fill=True,
                                fill_color='blue').add_to(m)

        # Line joining the two ends of the segment
        folium.PolyLine([segment_start, segment_end],
                        color='green',
                        weight=2.5,
                        opacity=1).add_to(m)

    m.save(output_path)

In [None]:
# Write the accidents / infrastructure map to an HTML file
make_map(infrastructure_with_acc)

In [None]:
# Remove the limit on the number of columns pandas shows when displaying a dataframe
pd.set_option("display.max_columns", None)

In [None]:
# imp = KNNImputer(n_neighbors=1)
# series_date = df["date"]
# df = df.drop('date', axis=1)
# df = pd.DataFrame(imp.fit_transform(df), columns=df.columns)
# # df = pd.concat([df, series_date])
# df

In [None]:
# df = one_hot_encoder(df, ['lum', 'int', 'atm', 'col', 'catr', 'circ', 'vosp','prof', 'plan', 'surf', 'infra', 'situ', 'catv', 'obs', 'obsm', 'choc','grav', 'obs2', 'obsm2', 'choc2'])
# df

In [None]:
# df["date"] = series_date
# df

In [None]:
# print("After imputation")
# missing(df)

In [None]:
# pca = PCA()
# df = df.drop('date', axis=1)
# X_reduced = pca.fit_transform(scale(df))
# X_reduced.shape

In [None]:
# cumulative_explained_variance = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
# print(cumulative_explained_variance)

In [None]:
# plt.bar(range(1,len(pca.explained_variance_ratio_ )+1),pca.explained_variance_ratio_ )
# plt.ylabel('Percentage of explained variance')
# plt.xlabel('Number of components')
# plt.plot(range(1,len(pca.explained_variance_ratio_ )+1),
#          np.cumsum(pca.explained_variance_ratio_ ),
#          c='red',
#          label="Cumulative Explained Variance")
# plt.legend(loc='upper left')

In [None]:
# inertie = []
# from tqdm import tqdm

# for i in tqdm(range(1, 300, 25)):
#     kmeans_pca = KMeans(n_clusters = i, init = 'k-means++', random_state=42)
#     kmeans_pca.fit(scale(df))
#     inertie.append(kmeans_pca.inertia_)

In [None]:
# plt.figure(figsize = (10, 8))
# plt.plot(range(1,300, 25), inertie, marker = 'o', linestyle = '--')
# plt.xlabel('K-means with pca clustering with k cluster')
# plt.ylabel('Inertie')
# plt.show()

### Merge the infra_with_good_index and accident_with_infra_index

In [69]:
# Load the two pre-computed CSV files: the infrastructure table and the accidents
# table carrying an 'infra_index' column (used as the merge key further down)
infra_with_good_index = pd.read_csv("../data/infra_with_good_index.csv")
accident_with_infra_index = pd.read_csv("../data/accident_with_infra_index.csv")

In [70]:
# Keep only the columns used below; "Unnamed: 0" is presumably the index saved with the CSV — confirm
infra_with_good_index = infra_with_good_index[["Unnamed: 0", "code_com_d", "ame_d", "regime_d","date_maj", "latitude_dep", "longitude_dep", "latitude_fin", "longitude_fin"]]

In [71]:
# Rename "Unnamed: 0" to "index" so it can serve as the right-hand merge key
infra_with_good_index = infra_with_good_index.rename(columns={"Unnamed: 0":"index"})


In [72]:
# Join each accident with the infrastructure row whose "index" equals its "infra_index".
# NOTE(review): the output displayed further down pairs accidents located around
# (48.61, 7.74) with infrastructure at (43.41, 3.69) although distance_to_infra is
# ~300 — verify that 'infra_index' and the CSV's "Unnamed: 0" use the same numbering.
infrastructure_with_acc = accident_with_infra_index.merge(infra_with_good_index, left_on="infra_index", right_on="index")

In [73]:
# Discard the leftover "Unnamed: 0" column
infrastructure_with_acc = infrastructure_with_acc.drop(columns=["Unnamed: 0"])

In [74]:
# Fill missing values with the value of the single nearest neighbour.
imp = KNNImputer(n_neighbors=1)

# Columns set aside before imputation and re-attached to `df` in the next cell.
# NOTE(review): 'regime_d' is dropped below but is not part of this selection,
# so it is not restored afterwards — confirm this is intended.
series_to_not_impute = infrastructure_with_acc[["date", "adr", "ame_d", "date_maj"]]

infrastructure_with_acc = infrastructure_with_acc.drop(columns=['date', 'adr', 'regime_d', 'ame_d', 'date_maj'])

# fit_transform returns a bare array: restore the column names AND the row index,
# because the set-aside columns are re-attached by index label.
df = pd.DataFrame(imp.fit_transform(infrastructure_with_acc),
                  columns=infrastructure_with_acc.columns,
                  index=infrastructure_with_acc.index)
df

Unnamed: 0,grav,senc,catv,obs,obsm,choc,catr,circ,nbv,vosp,prof,plan,surf,infra,situ,lum,int,atm,col,com,lat,long,dep,obs2,obsm2,choc2,is_in_agg,distance_to_infra,infra_index,index,code_com_d,latitude_dep,longitude_dep,latitude_fin,longitude_fin
0,2.0,1.0,3.0,10.0,0.0,1.0,3.0,2.0,2.0,2.0,1.0,1.0,1.0,0.0,5.0,1.0,2.0,1.0,7.0,31.0,42.897100,3.532100,650.0,0.0,2.0,7.0,1.0,17.675395,6457.0,6457.0,67.0,49.049786,7.781237,49.049770,7.781121
1,1.0,2.0,3.0,0.0,2.0,1.0,4.0,2.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,44.0,48.609240,7.744640,670.0,0.0,2.0,1.0,1.0,299.316854,2826.0,2826.0,34.0,43.413679,3.693792,43.413715,3.693961
2,1.0,1.0,3.0,0.0,2.0,1.0,4.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,44.0,48.612790,7.739060,670.0,0.0,2.0,1.0,1.0,180.438825,2826.0,2826.0,34.0,43.413679,3.693792,43.413715,3.693961
3,1.0,1.0,3.0,0.0,2.0,1.0,4.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,3.0,44.0,48.609950,7.743510,670.0,0.0,2.0,1.0,1.0,203.135211,2826.0,2826.0,34.0,43.413679,3.693792,43.413715,3.693961
4,1.0,1.0,3.0,0.0,2.0,0.0,4.0,2.0,2.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,3.0,44.0,48.610080,7.747410,670.0,0.0,2.0,1.0,1.0,253.137683,2826.0,2826.0,34.0,43.413679,3.693792,43.413715,3.693961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6354,1.0,0.0,3.0,0.0,1.0,0.0,4.0,1.0,3.0,3.0,1.0,1.0,1.0,0.0,1.0,3.0,1.0,1.0,6.0,69.0,45.753441,4.841344,69.0,0.0,2.0,1.0,1.0,0.257348,8480.0,8480.0,31.0,43.715379,1.390515,43.715321,1.390613
6355,1.0,2.0,3.0,0.0,1.0,1.0,4.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,5.0,3.0,0.0,1.0,6.0,13.0,43.265940,5.395340,13.0,0.0,2.0,1.0,1.0,5.399302,2656.0,2656.0,75.0,48.877443,2.407383,48.877302,2.406962
6356,1.0,1.0,1.0,0.0,0.0,1.0,4.0,1.0,1.0,0.0,1.0,1.0,9.0,0.0,1.0,3.0,0.0,1.0,7.0,75.0,48.862396,2.355522,75.0,0.0,2.0,8.0,1.0,0.205524,3362.0,3362.0,59.0,50.729968,2.747864,50.730064,2.747775
6357,1.0,2.0,3.0,0.0,2.0,3.0,4.0,2.0,2.0,0.0,1.0,1.0,2.0,0.0,1.0,3.0,2.0,2.0,3.0,75.0,48.881683,2.381055,75.0,0.0,2.0,2.0,1.0,2.039181,5853.0,5853.0,67.0,48.392884,7.624270,48.392915,7.624117


In [75]:
# Put the columns that were kept out of the imputation back onto the imputed frame
for column in ("date", "date_maj", "ame_d", "adr"):
    df[column] = series_to_not_impute[column]

In [77]:
# Parse the "date" column into pandas datetimes (needed for the date filtering and .dt access below)
df["date"]= pd.to_datetime(df["date"])

In [84]:
import plotly.graph_objects as go

# Keep the accidents dated from 2010-01-01 to 2020-12-31 (both bounds included).
# .copy() so that adding the 'year' column below does not write into a slice.
df = df[df['date'].between('2010-01-01','2020-12-31')].copy()

# Extract the year from the 'date' column
df['year'] = df['date'].dt.year

# Count the accidents per year and per gravity
df_agg = df.groupby(['year', 'grav'])['grav'].count().reset_index(name='count')

# Colour of each gravity level; any level missing from this mapping falls back to gray
colors = {1: 'green', 2: 'orange', 3: 'red'}

# One filled line per gravity level
data = []
for grav in df_agg['grav'].unique():
    df_grav = df_agg[df_agg['grav'] == grav]
    color = colors.get(grav, 'gray')
    trace = go.Scatter(x=df_grav['year'], y=df_grav['count'], name=f'Gravité {grav}',
                       mode='lines+markers', marker=dict(color=color),
                       fill='tozeroy', fillcolor=color)
    data.append(trace)

# Build the figure; the title states the period actually selected above
fig = go.Figure(data=data)
fig.update_layout(title='Evolution du nombre d\'accidents en France entre 2010 et 2020',
                  xaxis_title='Année', yaxis_title='Nombre d\'accidents')

# Display the figure
fig.show()

# TODO: add the evolution of the number of cyclists

In [88]:
import plotly.express as px

# Density map with one equally-weighted point per row. The previous version passed
# z='dep', which weighted every point by its department code, and a `labels`
# entry for a 'count' column that does not exist in `df`.
# NOTE(review): points are placed at the infrastructure start coordinates
# ('latitude_dep', 'longitude_dep'), not at the accident position ('lat', 'long') —
# confirm which one the title is meant to describe.
fig = px.density_mapbox(df, lat='latitude_dep', lon='longitude_dep',
                        color_continuous_scale='YlGnBu',
                        mapbox_style='carto-positron',
                        title='Répartition des accidents par commune')

# Display the figure
fig.show()


In [60]:
# Remove the "adr" column from the working frame
df = df.drop(columns=["adr"])

### DATA VIZ