In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

In [3]:
# Load dataset
data = pd.read_csv('source/data.csv')

In [4]:
# Convert all columns to numeric, setting errors to 'coerce'
data = data.apply(pd.to_numeric, errors='coerce')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447670 entries, 0 to 447669
Data columns (total 39 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   AccID                       447670 non-null  int64  
 1   day                         447670 non-null  int64  
 2   month                       447670 non-null  int64  
 3   year                        447670 non-null  int64  
 4   time                        447670 non-null  int64  
 5   lum                         447670 non-null  int64  
 6   atm_condition               447670 non-null  int64  
 7   collision_type              447670 non-null  int64  
 8   lat                         447670 non-null  float64
 9   long                        447670 non-null  float64
 10  route_category              447670 non-null  int64  
 11  traffic_regime              447670 non-null  int64  
 12  total_number_lanes          447670 non-null  int64  
 13  reserved_lane_

In [5]:
# 2. One-Hot Encoding for Nominal Variables
nominal_cols = ['atm_condition', 'collision_type', 'route_category', 'traffic_regime', 
                'vehicle_category', 'fixed_obstacle', 'mobile_obstacle', 'user_category', 
                'reason_travel', 'safety_equipment1']

data = pd.get_dummies(data, columns=nominal_cols, drop_first=True)

In [4]:
# 1. Label Encoding for Ordinal Variables
ordinal_cols = ['lum', 'surface_condition', 'plan', 'longitudinal_profile']
le = LabelEncoder()
for col in ordinal_cols:
    data[col] = le.fit_transform(data[col])

In [6]:
# 3. Define the target variable and features
X = data.drop(columns=['gravity'])  # Explanatory variables
y = data['gravity']  # Target variable

In [7]:
# 4. Handle missing values using imputation
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(X)



In [8]:
# 5. Standardization of Quantitative Variables
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [9]:
# 6. Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
# 7. Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

In [11]:
# 8. Predict and evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.73      0.80      0.76     56272
           2       0.12      0.60      0.20      3463
           3       0.38      0.37      0.38     20458
           4       0.70      0.45      0.54     54108

    accuracy                           0.59    134301
   macro avg       0.48      0.55      0.47    134301
weighted avg       0.65      0.59      0.60    134301



In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
# Define the parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],        # L1 = Lasso, L2 = Ridge
    'solver': ['liblinear']         # Use 'liblinear' solver for L1/L2 regularization
}

In [14]:
# Initialize the model
model = LogisticRegression(class_weight='balanced', max_iter=1000)

In [15]:
# Set up Grid Search with 5-fold cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro')

In [16]:
# Fit the model
grid_search.fit(X_train, y_train)

In [17]:
# Print the best parameters and the best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best F1-score: {grid_search.best_score_}")

Best parameters: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best F1-score: 0.5025635764983469


In [18]:
# Use the best model for predictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.71      0.83      0.77     56272
           2       0.16      0.35      0.22      3463
           3       0.45      0.41      0.43     20458
           4       0.68      0.54      0.60     54108

    accuracy                           0.63    134301
   macro avg       0.50      0.53      0.50    134301
weighted avg       0.64      0.63      0.63    134301

