In [1]:
# Import the libraries
# Base
import random
import warnings
import numpy as np
import pandas as pd

# Transformation
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

# Modelling
import statsmodels.formula.api as smf
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from xgboost import XGBClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA

# Re-Sampling
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Evaluation and metrics
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate, train_test_split
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Data Viz
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline




In [2]:
# Suppress specific future warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")
warnings.filterwarnings("ignore", category=FutureWarning, module="SMOTE")

In [3]:
# Load data
data = pd.read_csv('source/data.csv')

In [4]:
# Ensure that each column in the DataFrame is in the desired format

# Columns that should be of type object (string)
object_columns = ['AccID', 'vehicleID', 'num_veh']

# Columns that should be of type int64
int_columns = [
    'day', 'month', 'year', 'time', 'lum', 'atm_condition', 'collision_type',
    'route_category', 'traffic_regime', 'total_number_lanes', 'reserved_lane_code',
    'longitudinal_profile', 'plan', 'surface_condition', 'infra',
    'accident_situation', 'maximum_speed', 'traffic_direction', 'vehicle_category',
    'fixed_obstacle', 'mobile_obstacle', 'initial_impact_point', 'manv',
    'motor', 'seat', 'user_category', 'gravity', 'gender', 'birth_year',
    'reason_travel', 'safety_equipment1', 'age'
]

# Columns that should be of type float64
float_columns = [
    'lat', 'long', 'upstream_terminal_number', 'distance_upstream_terminal'
]

# Convert columns to the correct data types
data[object_columns] = data[object_columns].astype(str)
data[int_columns] = data[int_columns].astype(np.int64)
data[float_columns] = data[float_columns].astype(float)

# Check the data types to confirm the changes
data.dtypes


AccID                          object
day                             int64
month                           int64
year                            int64
time                            int64
lum                             int64
atm_condition                   int64
collision_type                  int64
lat                           float64
long                          float64
route_category                  int64
traffic_regime                  int64
total_number_lanes              int64
reserved_lane_code              int64
longitudinal_profile            int64
upstream_terminal_number      float64
distance_upstream_terminal    float64
plan                            int64
surface_condition               int64
infra                           int64
accident_situation              int64
maximum_speed                   int64
vehicleID                      object
num_veh                        object
traffic_direction               int64
vehicle_category                int64
fixed_obstac

In [5]:
# Selecting the columns to scale
features_to_scale = ['age', 'maximum_speed', 'distance_upstream_terminal', 'total_number_lanes', 'lat', 'long']

# Initialize the scaler 
scaler = StandardScaler()  

# Fit and transform the selected features
data[features_to_scale] = scaler.fit_transform(data[features_to_scale])

# Display the first few rows of the scaled features to confirm
data[features_to_scale].head()

Unnamed: 0,age,maximum_speed,distance_upstream_terminal,total_number_lanes,lat,long
0,-1.141512,0.509356,1.946753,5.385555,0.805698,-0.063104
1,-0.663829,0.509356,1.946753,5.385555,0.805698,-0.063104
2,1.140752,0.509356,1.946753,5.385555,0.805698,-0.063104
3,-0.716905,0.509356,1.782943,-0.340441,0.82129,-0.104276
4,-0.823057,1.4946,0.755404,3.954056,0.823628,-0.124441


In [6]:
data = data.drop(columns=['AccID', 'vehicleID', 'num_veh'])

In [7]:
# Define the feature set (X) and the target (y)
X = data.drop(columns=['gravity'])
y = data['gravity']

# Split the data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shape of the resulting datasets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((358136, 35), (89534, 35), (358136,), (89534,))

In [8]:
# Fill missing values to avoid errors during model fitting
X_filled = X.fillna(X.median())

# Create and fit the Logistic Regression model
model = LogisticRegression(max_iter=1000, solver='liblinear')

# Fit the model on the entire dataset
model.fit(X_filled, y)

# Get the coefficients of the model
coefficients = model.coef_[0]

# Pair coefficients with their corresponding feature names
feature_coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': coefficients
}).sort_values(by='Coefficient', ascending=False)
feature_coefficients

Unnamed: 0,Feature,Coefficient
34,age,8.058087000000001e-17
24,mobile_obstacle,5.567554e-17
11,total_number_lanes,4.9581560000000005e-17
7,lat,4.0520130000000004e-17
8,long,-6.658544e-18
12,reserved_lane_code,-1.0282060000000001e-17
15,distance_upstream_terminal,-3.766615e-17
20,maximum_speed,-6.045071000000001e-17
18,infra,-8.594051e-17
26,manv,-9.239612e-17


In [9]:
# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data['Feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X_filled.values, i) for i in range(len(X.columns))]

vif_data

Unnamed: 0,Feature,VIF
0,day,1.001065
1,month,1.009005
2,year,0.144106
3,time,7.915817
4,lum,1.066925
5,atm_condition,1.060705
6,collision_type,1.429426
7,lat,1.10834
8,long,1.074547
9,route_category,1.472496


In [10]:
# Fit Lasso model
lasso = Lasso(alpha=0.01)  # Adjust alpha as needed
lasso.fit(X_filled, y)

# Display non-zero coefficients (selected features)
selected_features = X.columns[lasso.coef_ != 0]
selected_features


Index(['day', 'month', 'year', 'time', 'lum', 'atm_condition',
       'collision_type', 'lat', 'route_category', 'reserved_lane_code',
       'upstream_terminal_number', 'plan', 'surface_condition', 'infra',
       'accident_situation', 'maximum_speed', 'vehicle_category',
       'fixed_obstacle', 'mobile_obstacle', 'manv', 'motor', 'seat',
       'user_category', 'gender', 'birth_year', 'reason_travel',
       'safety_equipment1'],
      dtype='object')

In [11]:
# Apply PCA to reduce dimensionality
pca = PCA(n_components=10)  # Set number of components as needed
X_pca = pca.fit_transform(X_filled)

# Use the transformed data for modeling
model.fit(X_pca, y)

In [12]:
# Compute correlation matrix
corr_matrix = X.corr().abs()

# Identify highly correlated features
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]

# Drop highly correlated features
X_reduced = X.drop(columns=to_drop)
X_reduced

Unnamed: 0,day,month,year,time,lum,atm_condition,collision_type,lat,long,route_category,...,fixed_obstacle,mobile_obstacle,initial_impact_point,manv,motor,seat,gender,birth_year,reason_travel,safety_equipment1
0,30,11,2019,5400000,4,1,2,0.805698,-0.063104,1,...,0,2,5,23,1,2,2,2002,0,1
1,30,11,2019,5400000,4,1,2,0.805698,-0.063104,1,...,0,2,5,23,1,1,2,1993,5,1
2,30,11,2019,5400000,4,1,2,0.805698,-0.063104,1,...,1,0,3,11,1,1,1,1959,0,1
3,30,11,2019,10200000,3,1,6,0.821290,-0.104276,1,...,4,0,1,0,1,1,2,1994,0,1
4,28,11,2019,54900000,1,1,4,0.823628,-0.124441,1,...,0,2,1,2,1,1,1,1996,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447665,1,1,2022,31200000,1,1,3,-1.440542,-0.288419,3,...,0,0,8,19,1,1,2,2002,5,1
447666,1,1,2022,31200000,1,1,3,-1.440542,-0.288419,3,...,0,0,8,19,1,8,2,2004,5,1
447667,1,1,2022,31200000,1,1,3,-1.440542,-0.288419,3,...,0,2,1,1,1,1,2,1953,5,1
447668,1,3,2022,60900000,1,1,2,0.217210,-0.516913,3,...,0,2,1,1,1,1,1,1992,1,2


In [13]:
# Fit the Ridge regression model
ridge = Ridge(alpha=1.0)  # Adjust alpha as needed
ridge.fit(X_filled, y)

# Create a DataFrame to display each feature and its corresponding coefficient
ridge_coefficients = pd.DataFrame({
    'Feature': X_filled.columns,   # X_filled.columns contains the feature names
    'Coefficient': ridge.coef_     # ridge.coef_ contains the corresponding coefficients
})

# Sort by the absolute value of coefficients to see the most important features first
ridge_coefficients = ridge_coefficients.sort_values(by='Coefficient', ascending=False)

# Display the DataFrame
print(ridge_coefficients)


                       Feature   Coefficient
29               user_category  3.663590e-01
27                       motor  2.078936e-01
30                      gender  1.962607e-01
20               maximum_speed  6.147104e-02
28                        seat  4.636660e-02
23              fixed_obstacle  4.389613e-02
16                        plan  4.196489e-02
33           safety_equipment1  2.977590e-02
4                          lum  2.254263e-02
17           surface_condition  2.035933e-02
22            vehicle_category  1.989387e-02
31                  birth_year  3.854751e-03
11          total_number_lanes  3.827628e-03
13        longitudinal_profile  3.601366e-03
25        initial_impact_point  1.296265e-03
3                         time -9.730372e-10
14    upstream_terminal_number -6.923457e-05
0                          day -2.620748e-04
8                         long -1.219850e-03
1                        month -1.349380e-03
10              traffic_regime -3.251758e-03
5         

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [14]:
import statsmodels.api as sm

# Add a constant to the feature set for the intercept
X_with_constant = sm.add_constant(X_filled)

# Fit the model using OLS (Ordinary Least Squares)
ols_model = sm.OLS(y, X_with_constant).fit()

# Print the model summary
ols_model.summary()


0,1,2,3
Dep. Variable:,gravity,R-squared:,0.149
Model:,OLS,Adj. R-squared:,0.149
Method:,Least Squares,F-statistic:,2244.0
Date:,"Wed, 04 Sep 2024",Prob (F-statistic):,0.0
Time:,16:21:19,Log-Likelihood:,-742110.0
No. Observations:,447670,AIC:,1484000.0
Df Residuals:,447634,BIC:,1485000.0
Df Model:,35,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,25.9176,3.411,7.599,0.000,19.233,32.603
day,-0.0003,0.000,-1.208,0.227,-0.001,0.000
month,-0.0013,0.001,-2.376,0.018,-0.002,-0.000
year,-0.0158,0.002,-8.773,0.000,-0.019,-0.012
time,-9.73e-10,9.82e-11,-9.907,0.000,-1.17e-09,-7.81e-10
lum,0.0225,0.001,17.098,0.000,0.020,0.025
atm_condition,-0.0057,0.001,-4.986,0.000,-0.008,-0.003
collision_type,-0.0547,0.001,-44.639,0.000,-0.057,-0.052
lat,-0.0102,0.002,-5.082,0.000,-0.014,-0.006

0,1,2,3
Omnibus:,2303219.172,Durbin-Watson:,2.108
Prob(Omnibus):,0.0,Jarque-Bera (JB):,31951.015
Skew:,0.008,Prob(JB):,0.0
Kurtosis:,1.691,Cond. No.,97700000000.0


In [15]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_filled)


In [16]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.01)
lasso.fit(X_filled, y)
selected_features = X_filled.columns[lasso.coef_ != 0]


In [17]:
from sklearn.ensemble import RandomForestRegressor
model_rf = RandomForestRegressor()
model_rf.fit(X_filled, y)
feature_importances = pd.DataFrame(model_rf.feature_importances_, index=X_filled.columns, columns=['Importance']).sort_values('Importance', ascending=False)


In [18]:
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_filled, y)


In [19]:
from xgboost import XGBRegressor
xgb_model = XGBRegressor()
xgb_model.fit(X_filled, y)


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0)
scores = cross_val_score(ridge, X_filled, y, cv=5, scoring='neg_mean_squared_error')
print(scores.mean())


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


-1.6140989277388116


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [21]:
from scipy import stats
z_scores = np.abs(stats.zscore(X_filled))
X_filtered = X_filled[(z_scores < 3).all(axis=1)]  # Filter out rows with z-scores > 3


In [22]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_filled)


In [23]:
X_scaled

array([[ 1.635754  ,  1.27308923, -1.31856076, ..., -1.17778001,
        -0.42596313, -1.14151193],
       [ 1.635754  ,  1.27308923, -1.31856076, ...,  0.6570552 ,
        -0.42596313, -0.66382879],
       [ 1.635754  ,  1.27308923, -1.31856076, ..., -1.17778001,
        -0.42596313,  1.14075195],
       ...,
       [-1.67843418, -1.70626763,  1.31357483, ...,  0.6570552 ,
        -0.42596313,  1.61843509],
       [-1.67843418, -1.11039626,  1.31357483, ..., -0.81081297,
         0.00495926, -0.45152517],
       [-1.67843418, -1.11039626,  1.31357483, ..., -1.17778001,
        -0.42596313, -0.87613241]])