In [23]:
import geopandas as gpd
import pandas as pd
import json
import requests
import os
from enum import Enum
from tqdm.notebook import tqdm
import pprint
import shutil

In [152]:
class AccidentData:
    """Manage a local cache of yearly accident CSV files, one folder per category.

    The download URLs are read from a JSON file stored in ``data_dir``. That
    file is indexed as ``urls[category][year]``; the list of known years is
    taken from the keys of the ``"usagers"`` entry.

    On construction the instance checks which ``<data_dir>/<category>/<year>.csv``
    files are missing and downloads them.

    Attributes:
        data_dir: Directory holding the URL file and the per-category folders.
        accident_corporels_urls_filename: Name of the JSON URL file.
        categories: Category names handled by this class.
        urls: Parsed content of the URL file (set by ``read_urls``).
        years: Year keys (strings) found under ``urls["usagers"]``.
        filenames: Mapping ``category -> list of missing years`` (set by
            ``check_missing_data``).
    """

    def __init__(self, data_dir=None):
        """Load the URL index, then check and download any missing file.

        Args:
            data_dir: Optional data directory. Defaults to ``../data``
                relative to the current working directory.
        """
        # Built with os.path.join rather than a "..\data" literal: "\d" is an
        # invalid escape sequence and a backslash is only a separator on Windows.
        self.data_dir = os.path.join("..", "data") if data_dir is None else data_dir
        self.accident_corporels_urls_filename = "accident_corporels_urls.json"
        self.categories = ["usagers", "vehicules", "lieux", "caracteristiques"]
        self.read_urls()
        self.years = list(self.urls["usagers"].keys())
        self.check_and_control_data()

    def read_urls(self):
        """Read the JSON URL index from ``data_dir`` into ``self.urls``."""
        urls_path = os.path.join(self.data_dir, self.accident_corporels_urls_filename)
        with open(urls_path, "r", encoding="utf-8") as file:
            self.urls = json.load(file)

    def download_data(self):
        """Download every file listed as missing in ``self.filenames``.

        ``check_missing_data`` must have been called first.

        Raises:
            requests.HTTPError: If the server answers with an error status.
                Without this check an error page would be saved as
                ``<year>.csv`` and never downloaded again.
            requests.Timeout: If the server does not answer in time.
        """
        if not self.filenames:
            print("[Check] Checking completed, no data is missing!")
            return

        print("[Check] Checking completed, some data is missing!")
        print("[Download] Downloading missing data...")
        for filename, years in self.filenames.items():
            filename_dir = os.path.join(self.data_dir, filename)
            os.makedirs(filename_dir, exist_ok=True)
            print(f"\n[Download] Downloading {filename} files...")
            total = len(years)
            for i, year in enumerate(years, start=1):
                response = requests.get(self.urls[filename][year], timeout=60)
                response.raise_for_status()
                # newline="" writes the payload as received; the default text
                # mode would turn "\r\n" into "\r\r\n" on Windows.
                target = os.path.join(filename_dir, f"{year}.csv")
                with open(target, "w", encoding="utf-8", newline="") as f:
                    f.write(response.text)
                # Simple 50-character text progress bar, redrawn in place.
                bar_length = int(50 * i / total)
                bar = "#" * bar_length + "-" * (50 - bar_length)
                print(f"{i}/{total} [{bar}]", end='\r')
        print("\n[Download] Download completed!")

    def check_missing_data(self):
        """Fill ``self.filenames`` with the years missing on disk per category.

        A category appears in the mapping only if at least one of its yearly
        files is missing. Each category gets its own list, never a reference
        to ``self.years``.
        """
        print("[Check] Checking if data is in your computer...")
        self.filenames = {}
        for categorie in self.categories:
            filename_path = os.path.join(self.data_dir, categorie)
            # A missing folder simply means every yearly file is missing.
            missing_years = [
                year
                for year in self.years
                if not os.path.exists(os.path.join(filename_path, f"{year}.csv"))
            ]
            if missing_years:
                self.filenames[categorie] = missing_years

    def check_and_control_data(self):
        """Detect missing files, then download them."""
        self.check_missing_data()
        self.download_data()

    def reset_db(self):
        """Delete every category folder and recompute the missing-file list.

        Nothing is downloaded here; call ``download_data`` (or
        ``check_and_control_data``) afterwards to fetch the files again.
        """
        print("[Reset] Reseting data...")
        for categorie in self.categories:
            categorie_path = os.path.join(self.data_dir, categorie)
            if os.path.exists(categorie_path):
                shutil.rmtree(categorie_path, ignore_errors=True)
        print("[Reset] Data have been deleted")
        self.check_missing_data()
        print("[Reset] Data have been reset")

    @staticmethod
    def _read_csv(path):
        """Read one CSV file, trying ``;`` first and falling back to ``,``."""
        try:
            frame = pd.read_csv(path, sep=";")
        except pd.errors.ParserError:
            return pd.read_csv(path, sep=",")
        # A comma-separated file usually parses "successfully" with sep=";"
        # as one single column whose header still contains the commas.
        if frame.shape[1] == 1 and "," in str(frame.columns[0]):
            return pd.read_csv(path, sep=",")
        return frame

    def get_pd_file_from_year(self, cat, begin, end=None, merge=True):
        """Load the cached CSV file(s) of a category for a year or a year range.

        Args:
            cat: Category name; matched case-insensitively against
                ``self.categories``.
            begin: Year to load, or first year of the range.
            end: Last year of the range (inclusive). ``None`` loads ``begin``
                only.
            merge: When a range is given, return one concatenated DataFrame
                (``True``) or the list of yearly DataFrames (``False``).

        Returns:
            A DataFrame, a list of DataFrames (range with ``merge=False``), or
            ``None`` after printing a message when the category is unknown,
            the range is empty, or a requested year is not in ``self.years``.
            The concatenated frame keeps each file's own row index.
        """
        category = cat.lower()
        if category not in self.categories:
            print(f"{cat} not exists")
            return None

        last = begin if end is None else end
        requested_years = list(range(int(begin), int(last) + 1))
        if not requested_years:
            print(f"Empty year range: {begin} > {end}")
            return None

        # Validate every year of the range, not only its two endpoints.
        unavailable = [year for year in requested_years if str(year) not in self.years]
        if unavailable:
            print(f"{cat}: no data for year(s) {', '.join(map(str, unavailable))}")
            return None

        # Use the normalised name so the path matches the folder on disk.
        cat_path = os.path.join(self.data_dir, category)
        frames = [
            self._read_csv(os.path.join(cat_path, f"{year}.csv"))
            for year in requested_years
        ]
        if end is None:
            return frames[0]
        return pd.concat(frames) if merge else frames

In [153]:
# Build the data manager; its constructor checks the local CSV files and
# downloads the ones that are missing.
acc_data = AccidentData()

[Check] Checking if data is in your computer...
[Check] Checking completed, no data is missing!


In [151]:
# Load the 2019 "lieux" file as a DataFrame and display it as the cell output.
acc_data.get_pd_file_from_year("lieux",2019)

..\data\lieux\2019.csv


Unnamed: 0,Num_Acc,catr,voie,v1,v2,circ,nbv,vosp,prof,pr,pr1,plan,lartpc,larrout,surf,infra,situ,vma
0,201900000001,1,3,0.0,,3,10,0,1,6,900,2,,,1,2,1,70
1,201900000002,1,1,0.0,,1,2,0,4,3,845,2,,,1,0,1,70
2,201900000003,1,86,0.0,,3,8,0,1,10,500,3,,,1,0,1,90
3,201900000004,1,4,0.0,,3,5,0,1,2,299,1,,,1,0,1,90
4,201900000005,1,86,0.0,INT,1,3,0,1,41,0,3,,,1,2,1,90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58835,201900058836,1,43,0.0,,1,3,0,1,15,500,1,,,2,3,1,130
58836,201900058837,1,35,0.0,,1,2,0,1,303,282,1,,,2,0,1,90
58837,201900058838,1,6,0.0,A,3,3,0,1,5,199,1,,,1,0,1,90
58838,201900058839,1,86,0.0,,1,1,0,1,59,99,3,,,1,0,1,50


In [108]:
# Read the 2021 "usagers" and "vehicules" files from the current user's
# Downloads folder. The folder is resolved from the home directory instead of
# a hardcoded absolute path, so the cell also runs under another account.
downloads_dir = os.path.join(os.path.expanduser("~"), "Downloads")
usagers = pd.read_csv(os.path.join(downloads_dir, "usagers-2021.csv"), sep=";")
vehicules = pd.read_csv(os.path.join(downloads_dir, "vehicules-2021.csv"), sep=";")

In [110]:
# Preview the first rows of the users table.
usagers.head()

Unnamed: 0,Num_Acc,id_vehicule,num_veh,place,catu,grav,sexe,an_nais,trajet,secu1,secu2,secu3,locp,actp,etatp
0,202100000001,201 764,B01,1,1,3,1,2000.0,1,0,9,-1,0,0,-1
1,202100000001,201 765,A01,1,1,1,1,1978.0,1,1,-1,-1,0,0,-1
2,202100000002,201 762,A01,1,1,4,1,1983.0,0,1,-1,-1,0,0,-1
3,202100000002,201 763,B01,1,1,3,1,1993.0,0,1,-1,-1,0,0,-1
4,202100000003,201 761,A01,1,1,1,1,1995.0,1,1,0,-1,0,0,-1


In [111]:
# Columns removed from the users table: id_vehicule, num_veh, place,
# secu1, secu2, secu3.
# NOTE: the result is assigned back to `usagers`, so running this cell a
# second time raises a KeyError (the columns are already gone).
usagers = usagers.drop(
    labels=[
        'id_vehicule',
        'num_veh',
        'place',
        'secu1',
        'secu2',
        'secu3',
    ],
    axis="columns",
)

In [112]:
# Preview the first rows of the vehicles table.
vehicules.head()

Unnamed: 0,Num_Acc,id_vehicule,num_veh,senc,catv,obs,obsm,choc,manv,motor,occutc
0,202100000001,201 764,B01,1,1,0,2,1,1,5,
1,202100000001,201 765,A01,1,7,0,9,3,17,1,
2,202100000002,201 762,A01,0,7,2,2,1,1,0,
3,202100000002,201 763,B01,0,7,0,2,1,9,0,
4,202100000003,201 761,A01,1,7,0,1,3,1,1,


In [None]:
# Columns removed from the vehicles table: id_vehicule, num_veh, senc,
# manv, motor, occutc.
# NOTE: the result is assigned back to `vehicules`, so running this cell a
# second time raises a KeyError (the columns are already gone).
vehicules = vehicules.drop(
    labels=[
        'id_vehicule',
        'num_veh',
        'senc',
        'manv',
        'motor',
        'occutc',
    ],
    axis="columns",
)

In [137]:
# Inner join of the users and vehicles tables on the accident number.
# NOTE(review): Num_Acc is the only join key, so every user row is paired with
# every vehicle row of the same accident (see the repeated users in the
# output). Confirm this cross pairing is intended; linking a user to their own
# vehicle would need a vehicle key, which the previous cells drop.
df = pd.merge(usagers, vehicules, on="Num_Acc")
df.head()

Unnamed: 0,Num_Acc,catu,grav,sexe,an_nais,trajet,locp,actp,etatp,catv,obs,obsm,choc
0,202100000001,1,3,1,2000.0,1,0,0,-1,1,0,2,1
1,202100000001,1,3,1,2000.0,1,0,0,-1,7,0,9,3
2,202100000001,1,1,1,1978.0,1,0,0,-1,1,0,2,1
3,202100000001,1,1,1,1978.0,1,0,0,-1,7,0,9,3
4,202100000002,1,4,1,1983.0,0,0,0,-1,7,2,2,1


In [80]:
# Keep only the rows of accidents in which a bicycle is involved, whoever the
# other users are; catv == 1 identifies bicycles.
# Step 1: accident numbers having at least one row with catv == 1.
Num_Acc = df.loc[df["catv"] == 1, "Num_Acc"]
# Step 2: every row (bicycle or not) belonging to one of those accidents.
velo_only = df.loc[df["Num_Acc"].isin(Num_Acc)]


In [82]:
# Preview the first rows of the bicycle-accident subset.
velo_only.head()

Unnamed: 0,Num_Acc,id_vehicule,num_veh,senc,catv,obs,obsm,choc,manv,motor,occutc,catu,grav,sexe,an_nais,trajet,locp,actp,etatp
0,202100000001,201 764,B01,1,1,0,2,1,1,5,,1,3,1,2000.0,1,0,0,-1
1,202100000001,201 764,B01,1,1,0,2,1,1,5,,1,1,1,1978.0,1,0,0,-1
2,202100000001,201 765,A01,1,7,0,9,3,17,1,,1,3,1,2000.0,1,0,0,-1
3,202100000001,201 765,A01,1,7,0,9,3,17,1,,1,1,1,1978.0,1,0,0,-1
18,202100000006,201 752,B01,2,1,0,0,5,1,5,,1,4,1,2009.0,2,0,0,-1


In [51]:
# Summary statistics of the numeric columns.
# NOTE(review): `usagers_velo` is not defined in any visible cell; this cell
# presumably relies on a variable left in the kernel by a removed cell, so it
# would fail on a fresh kernel. Confirm where it should come from.
usagers_velo.describe()

Unnamed: 0,Num_Acc,catu,grav,sexe,an_nais,trajet,locp,etatp
count,10651.0,10651.0,10651.0,10651.0,10255.0,10651.0,10651.0,10651.0
mean,202100000000.0,1.092667,2.451882,1.191719,1978.862311,3.356305,-0.375645,-0.929678
std,15359.96,0.38144,1.394021,0.614879,19.424436,2.862299,0.890958,0.410236
min,202100000000.0,1.0,-1.0,-1.0,1924.0,-1.0,-1.0,-1.0
25%,202100000000.0,1.0,1.0,1.0,1964.0,1.0,-1.0,-1.0
50%,202100000000.0,1.0,3.0,1.0,1981.0,4.0,0.0,-1.0
75%,202100000000.0,1.0,4.0,2.0,1995.0,5.0,0.0,-1.0
max,202100100000.0,3.0,4.0,2.0,2021.0,9.0,9.0,3.0
