In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OrdinalEncoder


In [3]:
data = pd.read_csv('source/data.csv')

In [5]:
# List of columns to convert to object type
columns_to_convert = [
    'AccID', 'accident_situation', 'atm_condition', 'collision_type', 'com_code', 'dep_code', 
    'fixed_obstacle', 'gender', 'gravity', 'infra', 'initial_impact_point', 'int', 'location', 
    'longitudinal_profile', 'lum', 'manv', 'mobile_obstacle', 'motor', 'num_veh_x', 'num_veh_y',
    'pedestrian_action', 'pedestrian_location', 'plan', 'reason_travel', 'reserved_lane_code', 
    'route_category', 'safety_equipment1', 'safety_equipment2', 'safety_equipment3', 'seat', 
    'surface_condition', 'time', 'traffic_direction', 'traffic_regime', 'upstream_terminal_number', 
    'user_category', 'vehicle_category', 'vehicleID_x', 'vehicleID_y'
]

In [7]:
# Convert specified columns to object type
data[columns_to_convert] = data[columns_to_convert].astype('object')

<font size="6">  
    Implementing feature engineering techniques for gravity
</font> 

In [10]:
# Ensure that the 'gravity' column exists in the dataset
if 'gravity' not in data.columns:
    data['gravity'] = [0] * len(data)  # Dummy column for demonstration

# Explicitly convert the target variable to integer type
data['gravity'] = data['gravity'].astype(int)

# Frequency Encoding for high cardinality columns
def frequency_encoding(df, col):
    freq_encoding = df[col].value_counts() / len(df)
    df[col + '_freq'] = df[col].map(freq_encoding)
    return df

# Apply frequency encoding to high cardinality categorical variables
high_cardinality_cols = ['location', 'com_code', 'vehicle_category']
for col in high_cardinality_cols:
    data = frequency_encoding(data, col)

# Drop the original high cardinality columns after encoding
data = data.drop(columns=high_cardinality_cols)

# Interaction Features
data['age_vehicle_interaction'] = data['age'] * data['vehicle_category_freq']
data['lat_long_interaction'] = data['lat'] * data['long']

# Polynomial Features for Age
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(data[['age']])
poly_df = pd.DataFrame(poly_features, columns=['age_poly', 'age_squared'])
data = pd.concat([data, poly_df], axis=1)

# Temporal Features
data['datetime'] = pd.to_datetime(data[['year', 'month', 'day']].astype(str).agg('-'.join, axis=1) + ' ' + data['time'])
data['hour'] = data['datetime'].dt.hour
data['day_of_week'] = data['datetime'].dt.dayofweek
data = data.drop(columns=['datetime'])

# Aggregation Features (before any encoding)
data['location_accidents'] = data.groupby('location_freq')['AccID'].transform('count')
data['mean_age_vehicle'] = data.groupby('vehicle_category_freq')['age'].transform('mean')

# Binning Age and Time
data['age_bin'] = pd.cut(data['age'], bins=[0, 18, 30, 50, 70, 100], labels=['0-18', '19-30', '31-50', '51-70', '70+'])
data['time_bin'] = pd.cut(data['hour'], bins=[0, 6, 12, 18, 24], labels=['Night', 'Morning', 'Afternoon', 'Evening'])

# One-Hot Encoding for low cardinality categorical variables
data = pd.get_dummies(data, columns=['age_bin', 'time_bin'], drop_first=True)

# Ordinal Encoding
ordinal_features = ['atm_condition', 'traffic_direction', 'fixed_obstacle']
encoder = OrdinalEncoder()
data[ordinal_features] = encoder.fit_transform(data[ordinal_features])

# Standard Scaling for numerical features
scaler = StandardScaler()
numerical_cols = ['age', 'lat', 'long', 'initial_impact_point', 'age_vehicle_interaction', 'lat_long_interaction']
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Log Transformation for age
data['log_age'] = np.log1p(data['age'])

# Ensure all features are numeric by excluding non-numeric columns
features = data.drop(columns=['gravity'])
features = features.select_dtypes(include=[np.number])

# Handle missing values using imputation
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Ensure the target variable is correctly processed
y = data['gravity']

# Check data types and unique values of the target variable
y_dtype = y.dtype, y.unique()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_imputed, y, test_size=0.3, random_state=42)

# Train a Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Extract feature importance
feature_importance = rf_model.feature_importances_

# Create a DataFrame for feature importance
feature_importance_df = pd.DataFrame({
    'Feature': features.columns,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

feature_importance_df.head(20), y_dtype


  result = getattr(ufunc, method)(*inputs, **kwargs)


(                    Feature  Importance
 15            com_code_freq    0.087679
 4                       lat    0.086605
 5                      long    0.079037
 18     lat_long_interaction    0.078858
 0                       day    0.067371
 21                     hour    0.060903
 1                     month    0.053106
 17  age_vehicle_interaction    0.044839
 22              day_of_week    0.042813
 13     initial_impact_point    0.036075
 20              age_squared    0.035644
 10                      age    0.035153
 19                 age_poly    0.034879
 8                birth_year    0.034811
 25                  log_age    0.031253
 2                      year    0.030485
 6        total_number_lanes    0.026815
 11        traffic_direction    0.025482
 7             maximum_speed    0.021542
 9       pedestrian_involved    0.019388,
 (dtype('int32'), array([4, 1, 3, 2])))

Top Features:
com_code_freq (0.087679): The frequency encoding of the commune code is the most important feature. This suggests that the location where the accident occurs has a significant impact on the severity (gravity) of the accident.
lat (0.086605) and long (0.079037): Latitude and longitude are also highly important, reinforcing the idea that geographic location plays a critical role.
lat_long_interaction (0.078858): The interaction between latitude and longitude is another strong indicator, likely capturing specific regional characteristics that affect accident severity.
Temporal Features:

day (0.067371): The day of the month is quite important, which might reflect patterns related to specific days or dates.
hour (0.060903): The hour of the day is also significant, indicating that the time of the accident influences its severity.
month (0.053106): The month of the year affects accident severity, potentially due to seasonal variations in weather, traffic, or other factors.
day_of_week (0.042813): The day of the week matters, possibly due to different traffic patterns or behaviors on weekdays versus weekends.
Age-Related Features:

age_vehicle_interaction (0.044839): The interaction between age and vehicle category is important, suggesting that the combination of a driver's age and their vehicle type affects accident outcomes.
age_squared (0.035644), age (0.035153), age_poly (0.034879), birth_year (0.034811), and log_age (0.031253): Various transformations of the age variable are significant, indicating that age-related factors are crucial in determining accident severity.
Initial Impact and Traffic Conditions:

initial_impact_point (0.036075): The point of initial impact in an accident is important, likely reflecting the type and severity of collisions.
traffic_direction (0.025482): The direction of traffic flow influences the severity, which could relate to the dynamics of head-on versus side-impact collisions.
total_number_lanes (0.026815) and maximum_speed (0.021542): Road infrastructure features like the number of lanes and maximum speed limits are important, indicating that road design impacts accident severity.
pedestrian_involved (0.019388): Whether a pedestrian is involved is also significant, highlighting the vulnerability of pedestrians in accidents.