In [2]:
import pandas as pd 
import numpy  as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
train_df = pd.read_csv("datasets/train.csv")
train_df.head()

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [8]:
train_df.shape

(517754, 14)

In [9]:
train_df.columns

Index(['id', 'road_type', 'num_lanes', 'curvature', 'speed_limit', 'lighting',
       'weather', 'road_signs_present', 'public_road', 'time_of_day',
       'holiday', 'school_season', 'num_reported_accidents', 'accident_risk'],
      dtype='object')

In [11]:
test_df = pd.read_csv("datasets/test.csv")
test_df.head()

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents
0,517754,highway,2,0.34,45,night,clear,True,True,afternoon,True,True,1
1,517755,urban,3,0.04,45,dim,foggy,True,False,afternoon,True,False,0
2,517756,urban,2,0.59,35,dim,clear,True,False,afternoon,True,True,1
3,517757,rural,4,0.95,35,daylight,rainy,False,False,afternoon,False,False,2
4,517758,highway,2,0.86,35,daylight,clear,True,False,evening,False,True,3


In [14]:
test_df.shape

(172585, 14)

In [13]:
test_df['accident_risk'] = 0

In [15]:
merged_df = pd.concat([train_df, test_df], axis=0)

In [16]:
merged_df.head()

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [17]:
merged_df.shape

(690339, 14)

In [18]:
merged_df.duplicated().sum()

np.int64(0)

In [19]:
merged_df.isnull().sum()

id                        0
road_type                 0
num_lanes                 0
curvature                 0
speed_limit               0
lighting                  0
weather                   0
road_signs_present        0
public_road               0
time_of_day               0
holiday                   0
school_season             0
num_reported_accidents    0
accident_risk             0
dtype: int64

In [22]:
object_cols = merged_df.select_dtypes(include=['object']).columns

In [24]:
for i in object_cols:
    print(merged_df[i].value_counts())
    print("-"*100)

road_type
highway    231752
rural      230128
urban      228459
Name: count, dtype: int64
----------------------------------------------------------------------------------------------------
lighting
dim         244969
daylight    237412
night       207958
Name: count, dtype: int64
----------------------------------------------------------------------------------------------------
weather
foggy    241699
clear    239288
rainy    209352
Name: count, dtype: int64
----------------------------------------------------------------------------------------------------
time_of_day
morning      231157
evening      230466
afternoon    228716
Name: count, dtype: int64
----------------------------------------------------------------------------------------------------


In [25]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_df = pd.DataFrame(ohe.fit_transform(merged_df[object_cols]))
ohe_df.columns = ohe.get_feature_names_out(object_cols)
ohe_df.index = merged_df.index
merged_df = pd.concat([merged_df, ohe_df], axis=1)
merged_df = merged_df.drop(object_cols, axis=1)
merged_df.shape
merged_df.head()

Unnamed: 0,id,num_lanes,curvature,speed_limit,road_signs_present,public_road,holiday,school_season,num_reported_accidents,accident_risk,...,road_type_urban,lighting_daylight,lighting_dim,lighting_night,weather_clear,weather_foggy,weather_rainy,time_of_day_afternoon,time_of_day_evening,time_of_day_morning
0,0,2,0.06,35,False,True,False,True,1,0.13,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,1,4,0.99,35,True,False,True,True,0,0.35,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,2,4,0.63,70,False,True,True,False,2,0.3,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,3,4,0.07,35,True,True,False,False,1,0.21,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,4,1,0.58,60,False,False,True,False,1,0.56,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [26]:
merged_df.shape

(690339, 22)

In [27]:
train = merged_df[:len(train_df)]
test = merged_df[len(train_df):]

In [28]:
X = train.drop("accident_risk", axis=1)
y = train['accident_risk']

In [29]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 0.005406767723981864
R^2 Score: 0.8041891320847725


In [31]:
test_preds = model.predict(test.drop("accident_risk", axis=1))
submission_df = pd.DataFrame({"id": test["id"],
"accident_risk": test_preds})
submission_df.to_csv("submission.csv", index=False)