In [1]:
import os
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
# Function to load and process files based on the given conditions
def process_files(file_list, column_order, rename_dict, drop_column=None):
    """Read a group of ';'-delimited CSV files and stack them into one frame.

    Each file is read with every value kept as a string, and malformed lines
    are skipped by the parser. A column named "Accident_Id" is renamed to
    "Num_Acc", ``drop_column`` is removed when present, and the columns are
    re-indexed to ``column_order`` (missing columns are filled with NaN,
    columns not listed are discarded).

    Parameters
    ----------
    file_list : list of str
        Paths of the CSV files to load.
    column_order : list of str
        Columns (and their order) every loaded frame is aligned to.
    rename_dict : dict
        Mapping applied to the column names of the combined frame.
    drop_column : str, optional
        Column removed from a file before re-indexing, if the file has it.

    Returns
    -------
    pandas.DataFrame
        Rows of all files with a fresh 0..n-1 index and renamed columns.
        When ``file_list`` is empty, an empty frame with the renamed
        ``column_order`` columns is returned.
    """
    frames = []
    for path in file_list:
        frame = pd.read_csv(path, delimiter=';', on_bad_lines='skip', dtype=str)
        # rename() ignores keys that are absent, so no membership check is needed
        frame = frame.rename(columns={"Accident_Id": "Num_Acc"})
        if drop_column and drop_column in frame.columns:
            frame = frame.drop(columns=[drop_column])
        frames.append(frame.reindex(columns=column_order))

    if frames:
        combined_df = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat raises "No objects to concatenate" on an empty list;
        # return an empty frame with the expected schema instead
        combined_df = pd.DataFrame(columns=column_order)

    return combined_df.rename(columns=rename_dict)

In [3]:
# Define file paths
source_folder = "source/"

# List the folder once and sort the entries: os.listdir returns names in
# arbitrary order, which would make the row order of the concatenated
# frames differ between machines and runs.
source_entries = sorted(os.listdir(source_folder))

# One list of paths per file family, selected by file-name prefix
caracteristiques_files = [os.path.join(source_folder, f) for f in source_entries if f.startswith("caracteristiques")]
lieux_files = [os.path.join(source_folder, f) for f in source_entries if f.startswith("lieux")]
usagers_files = [os.path.join(source_folder, f) for f in source_entries if f.startswith("usagers")]
vehicules_files = [os.path.join(source_folder, f) for f in source_entries if f.startswith("vehicules")]

In [4]:
# Define column orders and rename dictionaries.
# Each rename dictionary maps a source column name to its output name; the
# matching column-order list is simply the dictionary's keys in insertion order.
caracteristiques_rename = {
    "Num_Acc": "AccID",
    "jour": "day",
    "mois": "month",
    "an": "year",
    "hrmn": "time",
    "lum": "lum",
    "dep": "dep_code",
    "com": "com_code",
    "agg": "location",
    "int": "int",
    "atm": "atm_condition",
    "col": "collision_type",
    "adr": "address",
    "lat": "lat",
    "long": "long",
}
caracteristiques_columns = list(caracteristiques_rename)

lieux_rename = {
    "Num_Acc": "AccID",
    "catr": "route_category",
    "voie": "route_number",
    "v1": "route_number_index1",
    "v2": "alph_route_index",
    "circ": "traffic_regime",
    "nbv": "total_number_lanes",
    "vosp": "reserved_lane_code",
    "prof": "longitudinal_profile",
    "pr": "upstream_terminal_number",
    "pr1": "distance_upstream_terminal",
    "plan": "plan",
    "lartpc": "width_central_reservation",
    "larrout": "width_roadway",
    "surf": "surface_condition",
    "infra": "infra",
    "situ": "accident_situation",
    "vma": "maximum_speed",
}
lieux_columns = list(lieux_rename)

usagers_rename = {
    "Num_Acc": "AccID",
    "id_vehicule": "vehicleID",
    "num_veh": "num_veh",
    "place": "seat",
    "catu": "user_category",
    "grav": "gravity",
    "sexe": "gender",
    "an_nais": "birth_year",
    "trajet": "reason_travel",
    "secu1": "safety_equipment1",
    "secu2": "safety_equipment2",
    "secu3": "safety_equipment3",
    "locp": "pedestrian_location",
    "actp": "pedestrian_action",
    "etatp": "pedestrian_involved",
}
usagers_columns = list(usagers_rename)

vehicules_rename = {
    "Num_Acc": "AccID",
    "id_vehicule": "vehicleID",
    "num_veh": "num_veh",
    "senc": "traffic_direction",
    "catv": "vehicle_category",
    "obs": "fixed_obstacle",
    "obsm": "mobile_obstacle",
    "choc": "initial_impact_point",
    "manv": "manv",
    "motor": "motor",
    "occutc": "number_occupants_publictransport",
}
vehicules_columns = list(vehicules_rename)

In [5]:
# Process each group of files: one combined, renamed dataframe per file family
characteristics = process_files(
    file_list=caracteristiques_files,
    column_order=caracteristiques_columns,
    rename_dict=caracteristiques_rename,
)
locations = process_files(
    file_list=lieux_files,
    column_order=lieux_columns,
    rename_dict=lieux_rename,
)
# "id_usager" is removed from any users file that carries it
users = process_files(
    file_list=usagers_files,
    column_order=usagers_columns,
    rename_dict=usagers_rename,
    drop_column="id_usager",
)
vehicles = process_files(
    file_list=vehicules_files,
    column_order=vehicules_columns,
    rename_dict=vehicules_rename,
)

In [6]:
# Convert 'lat' and 'long' fields to float in the 'characteristics' dataframe.
# The raw strings use a decimal comma; regex=False makes the replacement a
# plain literal substitution regardless of the pandas version's default.
# pd.to_numeric(errors='coerce') turns any non-numeric token into NaN instead
# of aborting the whole cell, matching how the other columns are converted;
# the trailing astype(float) keeps the dtype float even if every value is integral.
for coord in ('lat', 'long'):
    characteristics[coord] = pd.to_numeric(
        characteristics[coord].str.replace(',', '.', regex=False),
        errors='coerce',
    ).astype(float)

In [7]:
# Convert 'time' field to proper time format in the 'characteristics' dataframe:
# parse the 'HH:MM' strings as datetimes, then keep only the time-of-day part
parsed_time = pd.to_datetime(characteristics['time'], format='%H:%M')
characteristics['time'] = parsed_time.dt.time

In [8]:
# Convert 'day', 'month', and 'year' fields to integer in the 'characteristics' dataframe.
# Unparseable values become missing, which the nullable Int32 dtype can hold.
for date_part in ('day', 'month', 'year'):
    numeric_part = pd.to_numeric(characteristics[date_part], errors='coerce')
    characteristics[date_part] = numeric_part.astype(pd.Int32Dtype())

In [9]:
# Replace invalid literals with NaN and convert specified columns in 'locations' dataframe

# Whole-number columns: '#ERREUR' -> NaN, then nullable Int32
for int_col in ('total_number_lanes', 'maximum_speed'):
    cleaned = locations[int_col].replace('#ERREUR', np.nan)
    locations[int_col] = pd.to_numeric(cleaned, errors='coerce').astype(pd.Int32Dtype())

# Decimal columns: decimal comma -> point, '#ERREUR' -> NaN, then numeric
for float_col in ('distance_upstream_terminal', 'width_central_reservation', 'width_roadway'):
    cleaned = locations[float_col].str.replace(',', '.').replace('#ERREUR', np.nan)
    locations[float_col] = pd.to_numeric(cleaned, errors='coerce')

In [10]:
# Turn 'birth_year' into numbers: '#ERREUR' and any other non-numeric text become NaN
users['birth_year'] = pd.to_numeric(users['birth_year'].replace('#ERREUR', np.nan), errors='coerce')

# Mean birth year over the non-missing values, truncated to an integer
average_birth_year = int(users['birth_year'].mean())

# Impute missing birth years with that mean and store the column as plain int
users['birth_year'] = users['birth_year'].fillna(average_birth_year).astype(int)

# Derive 'age' relative to the year in which this cell is executed
# NOTE(review): this is the age today, not the age at the time of the
# accident, and it changes with the run date - confirm this is intended
current_year = datetime.now().year
users['age'] = users['birth_year'].rsub(current_year)

In [11]:
# Convert 'number_occupants_publictransport' field to integer in the 'vehicles' dataframe:
# '#ERREUR' -> NaN, coerce the rest to numbers, store as nullable Int32
vehicles['number_occupants_publictransport'] = (
    vehicles['number_occupants_publictransport']
    .replace('#ERREUR', np.nan)
    .pipe(pd.to_numeric, errors='coerce')
    .astype(pd.Int32Dtype())
)

In [13]:
# Drop fields with a large number of missing values and outliers
vehicles = vehicles.drop(columns=['number_occupants_publictransport'])
locations = locations.drop(columns=[
    'alph_route_index',
    'distance_upstream_terminal',
    'width_central_reservation',
    'width_roadway',
    'route_number',
    'route_number_index1',
])

# Keep only rows whose speed limit lies in [6, 120]; rows with a missing
# speed limit are dropped as well because their mask entry is not True
locations = locations[(locations['maximum_speed'] >= 6) & (locations['maximum_speed'] <= 120)]

# Keep only users born after 1930
users = users[users['birth_year'] > 1930]

# Remove users whose gravity is the '-1' sentinel. The values were read as raw
# strings, so the sentinel may carry padding (' -1'); compare on the stripped
# text so that '-1', ' -1' and '-1 ' are all excluded, not only ' -1'.
users = users[users['gravity'].astype(str).str.strip() != '-1']



<font size="6">  
    Merge Dataframes
</font> 

In [14]:
# Merge dataframes.
# characteristics, locations and users are joined on the accident identifier.
# vehicles is joined on the accident AND the vehicle keys: joining on 'AccID'
# alone pairs every user with every vehicle of the same accident (inflating
# the row count) and duplicates 'vehicleID'/'num_veh' as *_x / *_y columns.
# With the full key each user row is matched only to its own vehicle and the
# key columns appear once. Rows where 'vehicleID' is missing on both sides
# still match, because pandas matches null keys against each other.
data = (
    characteristics
    .merge(locations, on='AccID')
    .merge(users, on='AccID')
    .merge(vehicles, on=['AccID', 'vehicleID', 'num_veh'])
)

<font size="6">  
    Export Dataframes
</font>  

In [15]:
# Write the merged table to 'data.csv', omitting the dataframe index
data.to_csv(path_or_buf='data.csv', index=False)