In [1]:
# Import the libraries
# Base
import random
import warnings
import numpy as np
import pandas as pd

# Transformation
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

# Modelling
import statsmodels.formula.api as smf
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from xgboost import XGBClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA

# Re-Sampling
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Evaluation and metrics
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate, train_test_split
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Data Viz
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline




In [2]:
# Suppress FutureWarning noise coming from specific third-party packages.
# `module` is a regular expression matched against the START of the name of the
# module that issues the warning, so each entry must be a package/module prefix.
# A class name such as "SMOTE" never matches anything; SMOTE lives in `imblearn`.
FUTURE_WARNING_MODULES = ("seaborn", "pandas", "imblearn")

for noisy_module in FUTURE_WARNING_MODULES:
    warnings.filterwarnings("ignore", category=FutureWarning, module=noisy_module)

In [3]:
# Read the raw CSV into a DataFrame
data_path = 'source/data.csv'
data = pd.read_csv(data_path)

In [4]:
# Coerce every column of the DataFrame to its intended dtype, one dtype group at a time

# Columns kept as strings (object dtype)
object_columns = ['AccID', 'vehicleID', 'num_veh']

# Columns stored as int64
int_columns = [
    'day', 'month', 'year', 'time',
    'lum', 'atm_condition', 'collision_type',
    'route_category', 'traffic_regime', 'total_number_lanes',
    'reserved_lane_code', 'longitudinal_profile', 'plan',
    'surface_condition', 'infra', 'accident_situation',
    'maximum_speed', 'traffic_direction', 'vehicle_category',
    'fixed_obstacle', 'mobile_obstacle', 'initial_impact_point',
    'manv', 'motor', 'seat',
    'user_category', 'gravity', 'gender',
    'birth_year', 'reason_travel', 'safety_equipment1',
    'age',
]

# Columns stored as float64
float_columns = ['lat', 'long', 'upstream_terminal_number', 'distance_upstream_terminal']

# Pair each target dtype with the columns that must carry it, then cast group by group
# (same order as before: strings, then integers, then floats)
dtype_groups = (
    (str, object_columns),
    (np.int64, int_columns),
    (float, float_columns),
)
for target_dtype, dtype_columns in dtype_groups:
    data[dtype_columns] = data[dtype_columns].astype(target_dtype)

# Show the resulting dtypes to confirm the casts took effect
data.dtypes


AccID                          object
day                             int64
month                           int64
year                            int64
time                            int64
lum                             int64
atm_condition                   int64
collision_type                  int64
lat                           float64
long                          float64
route_category                  int64
traffic_regime                  int64
total_number_lanes              int64
reserved_lane_code              int64
longitudinal_profile            int64
upstream_terminal_number      float64
distance_upstream_terminal    float64
plan                            int64
surface_condition               int64
infra                           int64
accident_situation              int64
maximum_speed                   int64
vehicleID                      object
num_veh                        object
traffic_direction               int64
vehicle_category                int64
fixed_obstac

In [5]:
# Columns to standardize
features_to_scale = [
    'age',
    'maximum_speed',
    'distance_upstream_terminal',
    'total_number_lanes',
    'lat',
    'long',
]

# StandardScaler rescales each selected column to zero mean and unit variance.
# NOTE(review): the scaler is fit on the full dataset, before the train/test split
# performed in a later cell, so test-set statistics leak into the scaled features.
# Consider fitting on the training split only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[features_to_scale])

# Write the scaled values back over the original columns
data[features_to_scale] = scaled_values

# Preview the first rows of the scaled columns as a sanity check
data[features_to_scale].head()

Unnamed: 0,age,maximum_speed,distance_upstream_terminal,total_number_lanes,lat,long
0,-1.141512,0.509356,1.946753,5.385555,0.805698,-0.063104
1,-0.663829,0.509356,1.946753,5.385555,0.805698,-0.063104
2,1.140752,0.509356,1.946753,5.385555,0.805698,-0.063104
3,-0.716905,0.509356,1.782943,-0.340441,0.82129,-0.104276
4,-0.823057,1.4946,0.755404,3.954056,0.823628,-0.124441


In [6]:
# Remove the string identifier columns so they do not end up in the feature matrix
identifier_columns = ['AccID', 'vehicleID', 'num_veh']
data = data.drop(identifier_columns, axis='columns')

In [7]:
# Separate the target column (y) from the remaining feature columns (X)
target_column = 'gravity'
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each of the four resulting pieces
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


((358136, 35), (89534, 35), (358136,), (89534,))

In [None]:
# Impute missing entries with each column's median so the fit does not fail on NaNs
column_medians = X.median()
X_filled = X.fillna(column_medians)

# Logistic regression using the liblinear solver, capped at 1000 iterations
model = LogisticRegression(solver='liblinear', max_iter=1000)

# Train on every row of the dataset (the train/test split is not used here)
model.fit(X_filled, y)

# Take the first row of the fitted coefficient matrix
# NOTE(review): if `gravity` has more than two classes, coef_ holds one row per
# class and only the first class is reported here — confirm this is intended.
coefficients = model.coef_[0]

# One row per feature, ordered from the largest to the smallest coefficient
feature_coefficients = pd.DataFrame({'Feature': X.columns, 'Coefficient': coefficients})
feature_coefficients = feature_coefficients.sort_values(by='Coefficient', ascending=False)
feature_coefficients

In [None]:
# Calculate VIF for each feature.
# variance_inflation_factor regresses one column on all the others exactly as
# given and does not add an intercept itself. Without a constant column the
# auxiliary regressions are forced through the origin and the resulting VIFs are
# inflated for any feature with a non-zero mean, so a column of ones is appended.
# It is placed LAST so indices 0..len(X.columns)-1 still line up with X.columns,
# and no VIF is reported for the constant itself.
vif_exog = np.column_stack([
    X_filled.to_numpy(dtype=float),
    np.ones(len(X_filled)),
])

vif_data = pd.DataFrame({
    'Feature': X.columns,
    'VIF': [variance_inflation_factor(vif_exog, i) for i in range(len(X.columns))],
})

vif_data

In [None]:
# Fit an L1-penalized linear model; alpha sets the penalty strength (tune as needed)
lasso = Lasso(alpha=0.01)
lasso.fit(X_filled, y)

# Features whose coefficient was not shrunk to exactly zero are the selected ones
nonzero_mask = lasso.coef_ != 0
selected_features = X.columns[nonzero_mask]
selected_features


In [None]:
# Apply PCA to reduce dimensionality (set the number of components as needed).
# random_state pins the projection on scikit-learn versions where
# svd_solver='auto' selects the randomized solver for data of this shape;
# the seed matches the one used for the train/test split.
# NOTE(review): PCA is sensitive to feature scale and only a subset of the columns
# was standardized earlier — consider standardizing every feature before PCA.
pca = PCA(n_components=10, random_state=42)
X_pca = pca.fit_transform(X_filled)

# Refit the existing `model` on the principal components.
# This overwrites its earlier fit on the raw features, so from here on `model`
# expects 10-column PCA input.
model.fit(X_pca, y)

In [None]:
# Absolute pairwise correlations between the features
corr_matrix = X.corr().abs()

# Keep only the strict upper triangle so every feature pair is examined once
upper_triangle_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_triangle_mask)

# Flag a column when it correlates above the threshold with any column before it
# (masked-out cells are NaN and never compare as greater than the threshold)
correlation_threshold = 0.8
exceeds_threshold = (upper > correlation_threshold).any(axis=0)
to_drop = upper.columns[exceeds_threshold.to_numpy()].tolist()

# Feature matrix without the flagged columns
X_reduced = X.drop(columns=to_drop)
X_reduced

In [None]:
# Fit the Ridge regression model (alpha is the L2 penalty strength; adjust as needed)
ridge = Ridge(alpha=1.0)
ridge.fit(X_filled, y)

# One row per feature with its fitted coefficient
ridge_coefficients = pd.DataFrame({
    'Feature': X_filled.columns,   # feature names, in the order the model saw them
    'Coefficient': ridge.coef_     # coefficient fitted for each of those features
})

# Sort by the ABSOLUTE value of the coefficients so the most influential features
# come first regardless of sign. Sorting on the signed value would push large
# negative coefficients to the bottom of the table.
ridge_coefficients = ridge_coefficients.sort_values(
    by='Coefficient', key=np.abs, ascending=False
)

# Display the DataFrame (bare last expression for the notebook's rich table rendering)
ridge_coefficients
