In [7]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import ADASYN
from sklearn.utils.class_weight import compute_sample_weight
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [8]:
df_risk = pd.read_csv("../Final/Risk_level_Dataset.csv")
df_risk.head()

Unnamed: 0,id,lat,lon,severity,time_of_day_Night,time_of_day_Morning,time_of_day_Afternoon,time_of_day_Evening,month,day_of_week,risk_level_encoded
0,345906,51.511963,-0.028211,0,1,0,0,0,1,3,1
1,345907,51.371636,-0.117621,0,0,0,0,1,1,2,2
2,345908,51.514951,-0.072747,0,0,0,0,1,1,2,1
3,345909,51.519173,-0.262356,0,0,1,0,0,1,3,1
4,345910,51.565743,-0.136308,0,0,1,0,0,1,3,1


In [9]:
X = df_risk.drop(columns=['severity', 'id'])
y = df_risk['severity']  

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

adasyn = ADASYN(random_state=42)
X_res, y_res = adasyn.fit_resample(X_train, y_train)

sample_weights = compute_sample_weight(class_weight='balanced', y=y_res)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(40500, 9)
(10126, 9)
(40500,)
(10126,)


In [10]:
X_train_risk_level = X_train.copy()
X_test_risk_level = X_test.copy()
y_train_risk_level = y_train.copy()
y_test_risk_level = y_test.copy()  

In [14]:
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

# from imblearn.combine import SMOTEENN
# smote_enn = SMOTEENN(random_state=42)
# X_res, y_res = smote_enn.fit_resample(X_train, y_train)


smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

sample_weights = compute_sample_weight(class_weight='balanced', y=y_res)

# Define the logistic regression model
log_reg = LogisticRegression(multi_class="multinomial", solver="saga", max_iter=500)

# Define parameter grid for GridSearch
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Wider range of regularization strengths
    'penalty': ['l2'],  # 'l2' is the only supported penalty for lbfgs solver
    'solver': ['saga'],  # Compatible with multinomial and L2
    'fit_intercept': [True, False],  # Whether to fit the intercept
    'max_iter': [200, 500, 1000],  # More iterations for convergence
    # Only include class_weight if you're not using sample weights
    # 'class_weight': [None, 'balanced']
}


# Setup GridSearchCV
grid_search = GridSearchCV(log_reg, param_grid, cv=3, scoring='f1_macro', n_jobs=-1)
grid_search.fit(X_res, y_res, sample_weight=sample_weights)  # Use balanced/resampled data here

# Predict with the best model
best_model = grid_search.best_estimator_
y_pred_grid = best_model.predict(X_test)

print("\nBest Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

model_regression = grid_search.best_estimator_




Best Parameters: {'C': 100, 'fit_intercept': True, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'saga'}
Best F1 Score: 0.5078661144790505


Best Parameters: {'C': 0.1, 'fit_intercept': False, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'lbfgs'}
Best F1 Score: 0.4997794116161038

Best Parameters: {'C': 100, 'fit_intercept': False, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}
Best F1 Score: 0.5084193961509678

Best Parameters: {'C': 100, 'fit_intercept': True, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'saga'}
Best F1 Score: 0.5078661144790505