In [None]:
import os
import numpy as np
import pandas as pd
from scipy import stats
from datetime import datetime
import matplotlib.pyplot as plt

In [None]:
# Function to load and process files based on the given conditions
def process_files(file_list, column_order, rename_dict, drop_column=None):
    """Read a group of CSV files and stack them into one renamed DataFrame.

    Every file is read with ``;`` as delimiter, all columns as ``str`` and
    malformed lines skipped. Per file, an ``Accident_Id`` column is renamed to
    ``Num_Acc``, ``drop_column`` is removed when present, and the columns are
    reindexed to ``column_order`` (columns missing from a file are added as
    all-NaN, columns not listed are discarded).

    Parameters
    ----------
    file_list : iterable of str
        Paths of the CSV files to read, concatenated in iteration order.
    column_order : list of str
        Columns (pre-rename names) to keep, in output order.
    rename_dict : dict
        Mapping applied to the column names of the combined frame.
    drop_column : str, optional
        Column removed from each file before reindexing, if it exists.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all files with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``file_list`` yields no files.
    """
    frames = []
    for file in file_list:
        frame = pd.read_csv(file, delimiter=';', on_bad_lines='skip', dtype=str)
        if "Accident_Id" in frame.columns:
            # Some files name the accident key differently; align it before reindexing
            frame = frame.rename(columns={"Accident_Id": "Num_Acc"})
        if drop_column and drop_column in frame.columns:
            frame = frame.drop(columns=[drop_column])
        frames.append(frame.reindex(columns=column_order))

    if not frames:
        # pd.concat would fail with an opaque "No objects to concatenate"
        raise ValueError("process_files: file_list is empty, no CSV files to load")

    combined_df = pd.concat(frames, ignore_index=True)
    return combined_df.rename(columns=rename_dict)

In [None]:
# Define file paths
source_folder = "source/"

# os.listdir returns entries in arbitrary order: list the folder once and sort it so
# the files of each group are always loaded (and later concatenated) in the same order.
source_entries = sorted(os.listdir(source_folder))

# One list of paths per file group, selected by file-name prefix
caracteristiques_files = [os.path.join(source_folder, f) for f in source_entries if f.startswith("caracteristiques")]
lieux_files = [os.path.join(source_folder, f) for f in source_entries if f.startswith("lieux")]
usagers_files = [os.path.join(source_folder, f) for f in source_entries if f.startswith("usagers")]
vehicules_files = [os.path.join(source_folder, f) for f in source_entries if f.startswith("vehicules")]

In [None]:
# Define column orders and rename dictionaries
# Each rename dictionary maps a source column name to the name used in the analysis.
# The matching column-order list is the dictionary's keys, in insertion order.
caracteristiques_rename = {
    "Num_Acc": "AccID",
    "jour": "day",
    "mois": "month",
    "an": "year",
    "hrmn": "time",
    "lum": "lum",
    "dep": "dep_code",
    "com": "com_code",
    "agg": "location",
    "int": "int",
    "atm": "atm_condition",
    "col": "collision_type",
    "adr": "address",
    "lat": "lat",
    "long": "long",
}
caracteristiques_columns = list(caracteristiques_rename)

lieux_rename = {
    "Num_Acc": "AccID",
    "catr": "route_category",
    "voie": "route_number",
    "v1": "route_number_index1",
    "v2": "alph_route_index",
    "circ": "traffic_regime",
    "nbv": "total_number_lanes",
    "vosp": "reserved_lane_code",
    "prof": "longitudinal_profile",
    "pr": "upstream_terminal_number",
    "pr1": "distance_upstream_terminal",
    "plan": "plan",
    "lartpc": "width_central_reservation",
    "larrout": "width_roadway",
    "surf": "surface_condition",
    "infra": "infra",
    "situ": "accident_situation",
    "vma": "maximum_speed",
}
lieux_columns = list(lieux_rename)

usagers_rename = {
    "Num_Acc": "AccID",
    "id_vehicule": "vehicleID",
    "num_veh": "num_veh",
    "place": "seat",
    "catu": "user_category",
    "grav": "gravity",
    "sexe": "gender",
    "an_nais": "birth_year",
    "trajet": "reason_travel",
    "secu1": "safety_equipment1",
    "secu2": "safety_equipment2",
    "secu3": "safety_equipment3",
    "locp": "pedestrian_location",
    "actp": "pedestrian_action",
    "etatp": "pedestrian_involved",
}
usagers_columns = list(usagers_rename)

vehicules_rename = {
    "Num_Acc": "AccID",
    "id_vehicule": "vehicleID",
    "num_veh": "num_veh",
    "senc": "traffic_direction",
    "catv": "vehicle_category",
    "obs": "fixed_obstacle",
    "obsm": "mobile_obstacle",
    "choc": "initial_impact_point",
    "manv": "manv",
    "motor": "motor",
    "occutc": "number_occupants_publictransport",
}
vehicules_columns = list(vehicules_rename)

In [None]:
# Build one DataFrame per file group from its paths, column order and rename map
characteristics = process_files(
    file_list=caracteristiques_files,
    column_order=caracteristiques_columns,
    rename_dict=caracteristiques_rename,
)
locations = process_files(
    file_list=lieux_files,
    column_order=lieux_columns,
    rename_dict=lieux_rename,
)
# 'id_usager' is removed from the user files before their columns are reindexed
users = process_files(
    file_list=usagers_files,
    column_order=usagers_columns,
    rename_dict=usagers_rename,
    drop_column="id_usager",
)
vehicles = process_files(
    file_list=vehicules_files,
    column_order=vehicules_columns,
    rename_dict=vehicules_rename,
)

In [None]:
# Replace the -1 sentinel with NaN
# The sentinel is matched as a whole cell with optional surrounding whitespace, so
# ' -1', '-1' and '-1 ' are all caught, while values that merely start with -1
# (e.g. '-1.52') are left alone.
# NOTE(review): a coordinate stored as exactly '-1' would also become NaN — confirm this cannot occur.
MISSING_SENTINEL = r'^\s*-1\s*$'

users['reason_travel'] = users['reason_travel'].replace(MISSING_SENTINEL, '0', regex=True) # '-1 - Not specified' to '0 - Unknown'
characteristics = characteristics.replace(MISSING_SENTINEL, np.nan, regex=True)
locations = locations.replace(MISSING_SENTINEL, np.nan, regex=True)
users = users.replace(MISSING_SENTINEL, np.nan, regex=True)
vehicles = vehicles.replace(MISSING_SENTINEL, np.nan, regex=True)

In [None]:
# Cast the 'day', 'month' and 'year' fields of 'characteristics' to nullable integers;
# values that cannot be parsed as numbers become missing
for date_part in ('day', 'month', 'year'):
    parsed_part = pd.to_numeric(characteristics[date_part], errors='coerce')
    characteristics[date_part] = parsed_part.astype('Int64')


In [None]:
# Numeric conversion of selected 'locations' columns; unparseable values become NaN
# Lane count: the '#ERREUR' literal is blanked explicitly before parsing
lanes_without_error_literal = locations['total_number_lanes'].replace('#ERREUR', np.nan)
locations['total_number_lanes'] = pd.to_numeric(lanes_without_error_literal, errors='coerce').astype('Int64')

# Speed limit: nullable integer
speed_numeric = pd.to_numeric(locations['maximum_speed'], errors='coerce')
locations['maximum_speed'] = speed_numeric.astype('Int64')

# Terminal number and distance keep whatever numeric dtype pd.to_numeric infers
for terminal_column in ('upstream_terminal_number', 'distance_upstream_terminal'):
    locations[terminal_column] = pd.to_numeric(locations[terminal_column], errors='coerce')

In [None]:
# Parse 'birth_year' in the 'users' dataframe as a number; invalid entries become NaN
birth_year_numeric = pd.to_numeric(users['birth_year'], errors='coerce')
users['birth_year'] = birth_year_numeric

In [None]:
# Convert the coordinate columns to float
# A ',' in the text is treated as the decimal separator and swapped for '.'. Values
# that still cannot be parsed become NaN (consistent with the other conversions in
# this notebook) instead of aborting the whole cell.
for coordinate in ('lat', 'long'):
    with_decimal_point = characteristics[coordinate].str.replace(',', '.', regex=False)
    characteristics[coordinate] = pd.to_numeric(with_decimal_point, errors='coerce').astype(float)

<font size="6">  
    Merge Dataframes
</font> 

In [None]:
# Inner-join the four tables: locations and vehicles are attached on the accident id,
# users on accident id + vehicle id + vehicle number
accident_key = 'AccID'
user_keys = ['AccID', 'vehicleID', 'num_veh']

data = (
    characteristics
    .merge(locations, on=accident_key, how='inner')
    .merge(vehicles, on=accident_key, how='inner')
    .merge(users, on=user_keys, how='inner')
)

In [None]:
# Derive 'age' as the accident year minus the user's birth year
data['age'] = data['year'].sub(data['birth_year'])

In [None]:
# Remove fully identical rows, keeping the first occurrence of each
data = data.drop_duplicates(keep='first')

In [None]:
# Keep only the columns whose share of NaN values is below 0.30
nan_fraction = data.isna().sum() / len(data)
columns_to_keep = nan_fraction < 0.30
data = data.loc[:, columns_to_keep]

In [None]:
# Share of NaN values per remaining column
remaining_nan_share = data.isna().sum() / len(data)
print(remaining_nan_share)

In [None]:
# Function to replace NaNs based on existing distribution
def fill_na_with_distribution(df, column, rng=None):
    """Fill the NaNs of ``df[column]`` in place by sampling its observed values.

    Each missing cell receives one of the column's non-NaN values, drawn at
    random with probability equal to that value's relative frequency.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column : str
        Name of the column to impute.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global generator is used
        (seedable with ``np.random.seed``). Pass a seeded generator to make
        the imputation reproducible.

    Returns
    -------
    None
        ``df`` is left untouched when the column has no NaN, or when it has
        only NaN (there is then no distribution to sample from).
    """
    missing = df[column].isna()
    n_missing = int(missing.sum())
    if n_missing == 0:
        return

    # Relative frequency of every non-NaN value (value_counts skips NaN by default)
    value_counts = df[column].value_counts(normalize=True)
    if value_counts.empty:
        # Entirely-NaN column: sampling from an empty set would raise
        return

    sampler = np.random if rng is None else rng
    # Draw positions rather than the values themselves, so NumPy never has to coerce
    # the candidate values into a single array dtype (e.g. ints mixed with strings).
    positions = sampler.choice(
        len(value_counts), size=n_missing, p=value_counts.to_numpy(dtype=float)
    )

    # Fill NaNs with the sampled values
    df.loc[missing, column] = value_counts.index.to_numpy()[positions]

# Apply the function to each column with NaN values.
# The imputation draws from NumPy's global random generator, so seed it first:
# without a seed every Restart & Run All would produce a different dataset.
IMPUTATION_SEED = 42
np.random.seed(IMPUTATION_SEED)

for column in data.columns:
    if data[column].isna().any():
        fill_na_with_distribution(data, column)

# Show every column when displaying frames from here on
pd.set_option('display.max_columns', None)
data.head(5)

In [None]:
# IQR test on 'maximum_speed'
speed_q1 = data["maximum_speed"].quantile(0.25)
speed_q3 = data["maximum_speed"].quantile(0.75)
IQR_maximum_speed = speed_q3 - speed_q1

# Lower and upper bounds: 1.5 * IQR beyond the quartiles
I1_maximum_speed = speed_q1 - 1.5 * IQR_maximum_speed
I2_maximum_speed = speed_q3 + 1.5 * IQR_maximum_speed

print()
print("IQR :", IQR_maximum_speed, end="\n\n")
print("Range :[", I1_maximum_speed, ";", I2_maximum_speed, "]")

# Display the speeds outside the fixed window [5, 125] (not the IQR range printed above)
speed_outside_window = (data['maximum_speed'] < 5) | (data['maximum_speed'] > 125)
data.loc[speed_outside_window, 'maximum_speed']

In [None]:
# IQR test on 'age'
age_q1 = data["age"].quantile(0.25)
age_q3 = data["age"].quantile(0.75)
IQR_age = age_q3 - age_q1

# Lower and upper bounds: 1.5 * IQR beyond the quartiles
I1_age = age_q1 - 1.5 * IQR_age
I2_age = age_q3 + 1.5 * IQR_age

print()
print("IQR :", IQR_age, end="\n\n")
print("Range :[", I1_age, ";", I2_age, "]")

# Display the ages outside the fixed window [0, 97] (not the IQR range printed above)
age_outside_window = (data['age'] < 0) | (data['age'] > 97)
data.loc[age_outside_window, 'age']

In [None]:
# Parse 'dep_code' as a number; anything that is not numeric becomes NaN
data["dep_code"] = pd.to_numeric(data["dep_code"], errors='coerce')

# Discard the rows whose 'dep_code' could not be parsed
# NOTE(review): this also removes any alphanumeric code present in the data
# (e.g. '2A'/'2B', if they occur) — confirm that is intended.
data = data.dropna(subset=["dep_code"])

# Quartiles and interquartile range
dep_q1 = data["dep_code"].quantile(0.25)
dep_q3 = data["dep_code"].quantile(0.75)
IQR_dep_code = dep_q3 - dep_q1

# Lower and upper bounds: 1.5 * IQR beyond the quartiles
I1_dep_code = dep_q1 - 1.5 * IQR_dep_code
I2_dep_code = dep_q3 + 1.5 * IQR_dep_code

# Distinct codes falling outside the bounds
dep_outside_bounds = (data["dep_code"] < I1_dep_code) | (data["dep_code"] > I2_dep_code)
outliers = data.loc[dep_outside_bounds, "dep_code"].unique()

# Report the IQR, the bounds and the outlying codes
print("IQR:", IQR_dep_code, end="\n\n")
print("Range: [", I1_dep_code, ";", I2_dep_code, "]")

print("\nUnique outlier values:")
print(outliers)

In [None]:
# Remove outliers
# Each mask flags the rows to discard and is computed on the frame left by the previous step.
speed_outlier = (data['maximum_speed'] < 5) | (data['maximum_speed'] > 125)
data = data[~speed_outlier]

# NOTE(review): the age inspection cell displays ages above 97, while this filter
# removes ages above 93 — confirm which cut-off is intended.
age_outlier = (data['age'] < 0) | (data['age'] > 93)
data = data[~age_outlier]

# presumably the overseas department/collectivity codes — TODO confirm
excluded_dep_codes = [988, 976, 974, 972, 973, 987, 986, 971, 977, 978, 975]
in_excluded_dep = data['dep_code'].isin(excluded_dep_codes)
data = data[~in_excluded_dep]

In [None]:
# Columns removed from the analysis frame
columns_to_exclude = [
    'dep_code',
    'com_code',
    'location',
    'int',
    'address',
    'route_number_index1',
    'route_number',
]
data = data.drop(labels=columns_to_exclude, axis="columns")


In [None]:
# Convert 'time' from colon-separated text to a single integer: first field * 3,600,000
# plus second field * 60,000 (i.e. milliseconds, if the fields are hours and minutes)
MS_PER_HOUR = 3600000
MS_PER_MINUTE = 60000


def _clock_to_milliseconds(clock):
    """Return the first ':'-separated field times MS_PER_HOUR plus the second times MS_PER_MINUTE."""
    fields = clock.split(':')
    return (int(fields[0]) * MS_PER_HOUR) + (int(fields[1]) * MS_PER_MINUTE)


data['time'] = data['time'].apply(_clock_to_milliseconds)

In [None]:
# Dry run: try to cast each column to its target dtype to identify which ones cause
# issues. The cast result is discarded on purpose; `data` is not modified here.
problematic_columns = []

# Target dtype per column ('Int64' is pandas' nullable integer dtype)
columns_to_convert = {
    'day': 'Int64',
    'month': 'Int64',
    'year': 'Int64',
    'time': 'Int64',
    'lum': 'Int64',
    'atm_condition': 'Int64',
    'collision_type': 'Int64',
    'lat': 'float64',
    'long': 'float64',
    'route_category': 'Int64',
    'traffic_regime': 'Int64',
    'total_number_lanes': 'Int64',
    'reserved_lane_code': 'Int64',
    'longitudinal_profile': 'Int64',
    'upstream_terminal_number': 'Int64',
    'distance_upstream_terminal': 'Int64',
    'plan': 'Int64',
    'surface_condition': 'Int64',
    'infra': 'Int64',
    'accident_situation': 'Int64',
    'maximum_speed': 'Int64',
    'traffic_direction': 'Int64',
    'vehicle_category': 'Int64',
    'fixed_obstacle': 'Int64',
    'mobile_obstacle': 'Int64',
    'initial_impact_point': 'Int64',
    'manv': 'Int64',
    'motor': 'Int64',
    'seat': 'Int64',
    'user_category': 'Int64',
    'gravity': 'Int64',
    'gender': 'Int64',
    'birth_year': 'Int64',
    'reason_travel': 'Int64',
    'safety_equipment1': 'Int64',
    'age': 'Int64'
}

# Which columns survive the earlier NaN-ratio filter depends on the data, so a listed
# column may be absent; skip those instead of failing with an uncaught KeyError.
missing_columns = [column for column in columns_to_convert if column not in data.columns]

for column, dtype in columns_to_convert.items():
    if column not in data.columns:
        continue
    try:
        data[column].astype(dtype)
    except (ValueError, TypeError):
        problematic_columns.append(column)

if missing_columns:
    print("Columns not present in data (skipped):", missing_columns)

problematic_columns


In [None]:
# Columns treated as categorical: every object, category or int64 column of the dataset
categorical_columns = list(data.select_dtypes(include=['object', 'category', 'int64']).columns)

# 'gravity' is the dependent variable, so it is not tested against itself
categorical_columns.remove('gravity')

# Labels under which the four values returned by chi2_contingency are stored
chi_square_result_labels = (
    'Chi-square statistic',
    'p-value',
    'Degrees of freedom',
    'Expected frequencies',
)

# Chi-square test between 'gravity' and each categorical variable, keyed by column name
chi_square_results_all = {}
for column in categorical_columns:
    contingency_table = pd.crosstab(data['gravity'], data[column])
    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
    chi_square_results_all[column] = dict(zip(chi_square_result_labels, (chi2, p, dof, expected)))

# Display the results
chi_square_results_all

In [None]:
# Chi-square test of independence between 'gravity' and each categorical column,
# followed by one bar chart of the statistics and one of the p-values.

# Exclude specific fields before testing: they are left out of the charts anyway, so
# building a contingency table for them is wasted work.
exclude_fields = {'AccID', 'vehicleID', 'num_veh'}
filtered_variables = [var for var in categorical_columns if var not in exclude_fields]

# Calculate Chi-square statistics for each remaining categorical column
chi_square_results_all = {}
for column in filtered_variables:
    contingency_table = pd.crosstab(data['gravity'], data[column])
    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
    chi_square_results_all[column] = {
        'Chi-square statistic': chi2,
        'p-value': p,
        'Degrees of freedom': dof,
        'Expected frequencies': expected
    }

variables = list(chi_square_results_all.keys())

# Extract the corresponding Chi-square statistics and p-values
filtered_chi_square_stats = [chi_square_results_all[var]['Chi-square statistic'] for var in filtered_variables]
filtered_p_values = [chi_square_results_all[var]['p-value'] for var in filtered_variables]

# Create a bar plot for Chi-square statistics
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(filtered_variables, filtered_chi_square_stats, color='skyblue')
ax.set_xlabel('Chi-square Statistic')
ax.set_title('Chi-square Statistics for Variables with Gravity')
ax.grid(axis='x', linestyle='--', alpha=0.7)
ax.invert_yaxis()
plt.show()

# Create a bar plot for p-values (log scale)
# A p-value of exactly 0 has no position on a log axis; lift it to the smallest
# positive float so its bar is still drawn.
smallest_positive = np.finfo(float).tiny
plotted_p_values = [max(p_value, smallest_positive) for p_value in filtered_p_values]

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(filtered_variables, plotted_p_values, color='salmon')
ax.set_xlabel('p-value (log scale)')
ax.set_title('p-values for Chi-square Tests (log scale) with Gravity')
# The bars are horizontal, so the p-values lie on the x axis: that is the axis to log-scale
ax.set_xscale('log')
ax.grid(axis='x', linestyle='--', alpha=0.7)
ax.invert_yaxis()
plt.show()

In [None]:
# Number of rows in `data` at this point of the notebook
data.shape[0]

In [None]:
# Summary of `data`: columns, non-null counts, dtypes and memory usage
data.info()

<font size="6">  
    Export Dataframes
</font>  

In [None]:
# Write `data` to 'data.csv' (path relative to the working directory), without the index column
data.to_csv('data.csv', index=False)