In [1]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_csv('train.csv')

In [4]:
df.shape

(43550, 14)

In [5]:
df=df.dropna()

In [6]:
df.shape

(43549, 14)

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class AccidentDataEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mappings = {
            'road_type': {'highway': 1, 'rural': 0, 'urban': 2},
            'lighting': {'dim': 1, 'night': 0, 'daylight': 2},
            'weather': {'foggy': 1, 'clear': 0, 'rainy': 2},
            'time_of_day': {'morning': 1, 'evening': 0, 'afternoon': 2}
        }
        self.boolean_cols = ['road_signs_present', 'public_road', 'holiday', 'school_season']

    def fit(self, X, y=None):
        # Nothing to learn — just return self
        return self

    def transform(self, X):
        X = X.copy()

        # Apply categorical mappings
        for col, mapping in self.mappings.items():
            if col in X.columns:
                X[col] = X[col].map(mapping)

        # Apply boolean encoding
        for col in self.boolean_cols:
            if col in X.columns:
                X[col] = X[col].astype(str).str.lower().map({'true': 1, 'false': 0})

        return X

In [8]:
X = df.drop(columns=['id', 'accident_risk'])
y = df['accident_risk']

In [9]:
pipeline = Pipeline([
    ('encoder', AccidentDataEncoder())
])

# Transform both train and test datasets
X_encoded = pipeline.fit_transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [10]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import BaggingRegressor

rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    min_samples_split=4,
    min_samples_leaf=2,
    max_features="sqrt",
    max_samples=0.5,
    bootstrap=True,
    n_jobs=-1,
    random_state=42
)

In [11]:
rf.fit(X_train, y_train)

In [12]:
y_pred_rf=rf.predict(X_test)

In [13]:
r2_rf = r2_score(y_test, y_pred_rf)
rmse_rf = mean_squared_error(y_test, y_pred_rf)

In [14]:
print(f"R²: {r2_rf:.4f}")
print(f"RMSE: {rmse_rf:.4f}")

R²: 0.8725
RMSE: 0.0034


In [53]:


from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

model = XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=4,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0,
    reg_alpha=0.0,
    reg_lambda=1.0,
    objective="reg:squarederror",
    n_jobs=-1,
    random_state=42
)


In [54]:
model.fit(X_train, y_train)

In [55]:
y_pred_xg=model.predict(X_test)

In [56]:
r2_xg = r2_score(y_test, y_pred_xg)
rmse_xg = mean_squared_error(y_test, y_pred_xg)

In [57]:
print(f"R²: {r2_xg:.4f}")
print(f"RMSE: {rmse_xg:.4f}")

R²: 0.8775
RMSE: 0.0033
