In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

In [None]:
# Read the raw dataset from its CSV file into a DataFrame
csv_path = 'source/data.csv'
data = pd.read_csv(csv_path)

In [None]:
# Force every column to a numeric dtype, one column at a time; any value that
# cannot be parsed as a number becomes NaN (errors='coerce')
data = data.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [None]:
# 1. Label Encoding for Ordinal Variables
# Only the observed (non-missing) values are encoded. LabelEncoder has no
# notion of a missing value: a NaN fed to it comes back as an ordinary integer
# code, so the column would no longer contain any NaN and the median
# imputation applied to the features later on could never treat those rows as
# missing.
ordinal_cols = ['lum', 'surface_condition', 'plan', 'longitudinal_profile']
le = LabelEncoder()
for col in ordinal_cols:
    observed = data[col].notna()
    if not observed.any():
        continue  # column is entirely missing: nothing to encode
    encoded = data[col].astype('float64')  # float copy so NaN can be kept
    encoded[observed] = le.fit_transform(data.loc[observed, col])
    data[col] = encoded

In [None]:
# 2. One-Hot Encoding for Nominal Variables
# Each listed column is replaced by indicator columns; drop_first=True omits
# the indicator of the first level of every variable.
nominal_cols = [
    'atm_condition',
    'collision_type',
    'route_category',
    'traffic_regime',
    'vehicle_category',
    'fixed_obstacle',
    'mobile_obstacle',
    'user_category',
    'reason_travel',
    'safety_equipment1',
]

data = pd.get_dummies(data=data, columns=nominal_cols, drop_first=True)

In [None]:
# 3. Define the target variable and features
# The numeric coercion applied to the whole frame can leave NaN in 'gravity'.
# Rows without a target are left out here: imputation only fills the features,
# and the classifier cannot be fitted on a missing label.
labelled = data.dropna(subset=['gravity'])
X = labelled.drop(columns=['gravity'])  # Explanatory variables
y = labelled['gravity']  # Target variable

In [None]:
# 4. Split the dataset into training and testing sets
# The split comes BEFORE imputation and scaling so that the medians, means and
# standard deviations are learned from the training rows only. Fitting those
# transformers on the full dataset would leak test-set statistics into training
# and make the evaluation optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 5. Handle missing values using imputation
# Medians are computed on the training split and re-used for the test split.
imputer = SimpleImputer(strategy='median')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# 6. Standardization of Quantitative Variables
# Mean/variance are estimated on the training split and re-used for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# 7. Fit the baseline classifier: logistic regression with balanced class
# weights and an iteration cap of 1000
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X=X_train, y=y_train)

In [None]:
# 8. Predict on the test split and print the classification report
y_pred = model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter candidates explored by the grid search
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],  # Inverse regularization strength (smaller = stronger penalty)
    penalty=['l1', 'l2'],        # L1 = Lasso, L2 = Ridge
    solver=['liblinear'],        # 'liblinear' handles both the L1 and the L2 penalty
)

In [None]:
# Unfitted logistic regression (balanced class weights, at most 1000
# iterations) used as the base estimator for the grid search
model = LogisticRegression(max_iter=1000, class_weight='balanced')

In [None]:
# Grid search over param_grid with 5-fold cross-validation, scored by
# macro-averaged F1
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=5,
)

In [None]:
# Run the grid search on the training split
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Show the selected hyper-parameters and the cross-validated score they achieved
print(
    f"Best parameters: {grid_search.best_params_}",
    f"Best F1-score: {grid_search.best_score_}",
    sep="\n",
)

In [None]:
# Evaluate the best estimator found by the grid search on the test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)