In [30]:
import pandas as pd
 
# Read the train and test splits from the working directory (train first).
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

# Print column names, non-null counts and dtypes of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517754 entries, 0 to 517753
Data columns (total 14 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   id                      517754 non-null  int64  
 1   road_type               517754 non-null  object 
 2   num_lanes               517754 non-null  int64  
 3   curvature               517754 non-null  float64
 4   speed_limit             517754 non-null  int64  
 5   lighting                517754 non-null  object 
 6   weather                 517754 non-null  object 
 7   road_signs_present      517754 non-null  bool   
 8   public_road             517754 non-null  bool   
 9   time_of_day             517754 non-null  object 
 10  holiday                 517754 non-null  bool   
 11  school_season           517754 non-null  bool   
 12  num_reported_accidents  517754 non-null  int64  
 13  accident_risk           517754 non-null  float64
dtypes: bool(4), float64(2), int64(4), object(4)

In [31]:
# Encoding
from sklearn.preprocessing import LabelEncoder

# Columns to convert to integer codes. Every column is cast to `str` before
# encoding, so the boolean flags are encoded through their string form.
objects = [
    "road_type",
    "lighting",
    "weather",
    "road_signs_present",
    "public_road",
    "time_of_day",
    "holiday",
    "school_season",
]

# One encoder per column, fitted on the train and test values together so
# both frames share a single value -> code mapping and `transform` never
# meets a label it has not seen.
#
# The loop variable is `column`, not `object`: binding `object` at notebook
# scope would shadow the builtin for every cell executed afterwards.
for column in objects:
    # Cast once and reuse the result for both fitting and transforming.
    train_values = train_data[column].astype(str)
    test_values = test_data[column].astype(str)

    le = LabelEncoder()
    combined = pd.concat([train_values, test_values], axis=0)
    le.fit(combined)

    train_data[column] = le.transform(train_values)
    test_data[column] = le.transform(test_values)


In [32]:
# NOTE(review): `DataFrame.drop` returns a new frame and neither result is
# assigned here, so `train_data` and `test_data` themselves keep their `id`
# column. Only the last expression is rendered as the cell output.
train_data.drop(columns=["id"])
test_data.drop(columns=["id"])

Unnamed: 0,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents
0,0,2,0.34,45,2,0,1,1,0,1,1,1
1,2,3,0.04,45,1,1,1,0,0,1,0,0
2,2,2,0.59,35,1,0,1,0,0,1,1,1
3,1,4,0.95,35,0,2,0,0,0,0,0,2
4,0,2,0.86,35,0,0,1,0,1,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...
172580,1,2,0.01,45,1,2,0,0,0,1,1,2
172581,1,1,0.74,70,0,1,0,1,0,0,0,2
172582,2,2,0.14,70,1,0,0,0,1,1,1,1
172583,2,1,0.09,45,0,1,1,1,2,0,1,0


In [33]:
# Target: the `accident_risk` column of the training frame.
y = train_data["accident_risk"]
# Features: every other column currently present in `train_data`.
x = train_data.drop(columns="accident_risk")

In [34]:
import xgboost as xgb

# Baseline XGBoost regressor settings, collected in one place so they are
# easy to inspect next to the search space used for tuning.
baseline_params = dict(
    n_estimators=100,
    max_depth=3,
    min_child_weight=5,
    colsample_bytree=0.8,
    learning_rate=0.1,
    random_state=42,  # fixed seed for reproducible fits
    n_jobs=-1,        # use all available cores
)

model = xgb.XGBRegressor(**baseline_params)


In [35]:
# Fit the baseline regressor on the full training features and target.
model.fit(X=x, y=y)

In [None]:
#Hyper-parameter Tuning
from sklearn.model_selection import RandomizedSearchCV

# Candidate values the randomized search samples from.
parms_distribution = dict(
    n_estimators=[50, 60, 70, 90, 100],
    max_depth=[3, 4, 5],
    min_child_weight=[3, 4, 5, 6],
    colsample_bytree=[0.8, 0.85, 0.9, 1.0],
)

# 50 sampled configurations, each scored with 3-fold cross-validation.
# No `scoring` is passed, so candidates are ranked by the estimator's own
# `score` method.
random_search = RandomizedSearchCV(
    model,
    parms_distribution,
    n_iter=50,
    cv=3,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X=x, y=y)

In [44]:
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import cross_val_score

# RMSE input: predictions of the fitted search on `x`, the same data the
# search was fitted on, so this is an in-sample figure.
train_predictions = random_search.predict(x)
mse_error = mean_squared_error(y_true=y, y_pred=train_predictions)

# NOTE(review): this cross-validates the baseline `model`, not the tuned
# search, and scores it with mean absolute error; the two printed numbers
# therefore measure different estimators with different metrics.
x_predicted_1 = cross_val_score(
    estimator=model,
    X=x,
    y=y,
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1,
)

print(f"The rmse error is {np.sqrt(mse_error):.4f}")
print(f"The cross errror is {-x_predicted_1.mean():.4f}")

The rmse error is 0.0562
The cross errror is 0.0443


In [45]:
# Test-set prediction
# Predict with the fitted randomized search, which delegates to the best
# estimator it refit on the full training data. Calling `model.predict`
# here would silently ignore the tuning step and use the baseline settings.
test_predict = random_search.predict(test_data)

In [46]:
# Submission
# Re-read the raw test file and take the ids from it, so the submission does
# not depend on whether the (label-encoded) feature frame still carries an
# `id` column. Row order matches `test_predict` because both come from the
# same file read in the same order.
test_data_1 = pd.read_csv("test.csv")

submisson = pd.DataFrame({
    "id": test_data_1["id"],
    "accident_risk": test_predict,
})

# `index=False` keeps the pandas row index out of the submission file.
submisson.to_csv('submisson.csv', index=False)