In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [10]:
# Load the accidents table. The file is tab-separated and is decoded as
# ISO-8859-1 (Latin-1), so both the separator and the encoding are explicit.
accidents = pd.read_csv(
    './data/Accidents.txt',
    sep='\t',
    encoding="ISO-8859-1",
)

In [22]:
# Preview the first five accident rows.
# NOTE(review): the output saved with this cell is an `.info()` summary, not a
# row preview, so it appears to be stale -- re-run the cell to refresh it.
accidents.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57783 entries, 0 to 57782
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   AccidentId        57783 non-null  int64 
 1   Gravity           57783 non-null  object
 2   Date              57783 non-null  object
 3   Hour              57783 non-null  object
 4   Light             57783 non-null  object
 5   Department        57783 non-null  int64 
 6   Commune           57783 non-null  int64 
 7   InAgglomeration   57783 non-null  object
 8   IntersectionType  57783 non-null  object
 9   Weather           57778 non-null  object
 10  CollisionType     57781 non-null  object
 11  PostalAddress     57228 non-null  object
dtypes: int64(3), object(9)
memory usage: 5.3+ MB


In [14]:
# Load the users table (one row per person involved in an accident, judging by
# the AccidentId / VehicleId / Seat columns -- TODO confirm). Same format as the
# accidents file: tab-separated, decoded as ISO-8859-1.
users = pd.read_csv(
    './data/Users.txt',
    sep='\t',
    encoding="ISO-8859-1",
)

In [19]:
# Preview the first five user rows.
users.head(n=5)

Unnamed: 0,AccidentId,VehicleId,Seat,Category,Gravity,Gender,TripReason,SafetyDevice,SafetyDeviceUsed,PedestrianLocation,PedestrianAction,PedestrianCompany,BirthYear
0,201800000001,A01,1.0,Driver,Unscathed,Male,Leisure,SeatBelt,Yes,,,Unknown,1960.0
1,201800000001,B01,1.0,Driver,InjuredAndHospitalized,Male,,SeatBelt,Yes,,,Unknown,1928.0
2,201800000002,A01,1.0,Driver,Unscathed,Male,,SeatBelt,Yes,,,Unknown,1947.0
3,201800000002,A01,,Pedestrian,MildlyInjured,Male,,Helmet,,OnLane<=OnSidewalk0mCrossing,Crossing,Alone,1959.0
4,201800000003,A01,1.0,Driver,InjuredAndHospitalized,Male,Leisure,Helmet,Yes,,,Unknown,1987.0


In [21]:
# Structural summary first: `info()` prints its report (dtypes, non-null
# counts, memory usage) as a side effect, so it does not need to be the cell's
# final expression.
users.info()

# Summary statistics for every column (numeric and non-numeric). This must be
# the LAST expression: a notebook cell only renders the value of its final
# expression, so when `describe()` came before `info()` its result was computed
# and silently discarded.
users.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130169 entries, 0 to 130168
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   AccidentId          130169 non-null  int64  
 1   VehicleId           130169 non-null  object 
 2   Seat                119209 non-null  float64
 3   Category            130169 non-null  object 
 4   Gravity             130169 non-null  object 
 5   Gender              130169 non-null  object 
 6   TripReason          101713 non-null  object 
 7   SafetyDevice        126040 non-null  object 
 8   SafetyDeviceUsed    116075 non-null  object 
 9   PedestrianLocation  10043 non-null   object 
 10  PedestrianAction    10303 non-null   object 
 11  PedestrianCompany   75523 non-null   object 
 12  BirthYear           130139 non-null  float64
dtypes: float64(2), int64(1), object(10)
memory usage: 12.9+ MB


In [37]:
# Load the pre-merged modelling table. The `_x` / `_y` column suffixes visible
# in the preview suggest it was produced by pandas merges of the raw tables --
# presumably users, accidents and vehicles; verify against the script that
# writes this file.
# NOTE(review): no `index_col` is given, so if the CSV was written with its
# index, that index is loaded as an ordinary column and will end up among the
# model features -- confirm.
df = pd.read_csv(filepath_or_buffer='./src/processed/merged_data.csv')

In [38]:
# Preview the first five rows of the merged table.
df.head(n=5)

Unnamed: 0,AccidentId,VehicleId,Seat,Category_x,Gravity_x,Gender,TripReason,SafetyDevice,SafetyDeviceUsed,PedestrianLocation,...,Weather,CollisionType,PostalAddress,Direction,Category_y,PassengerNumber,FixedObstacle,MobileObstacle,ImpactPoint,Maneuver
0,201800000001,A01,1.0,Driver,Unscathed,Male,Leisure,SeatBelt,Yes,,...,Normal,2Vehicles-BehindVehicles-Frontal,route des Ansereuilles,Unknown,Car<=3.5T,0,,Vehicle,RightFront,TurnToLeft
1,201800000001,B01,1.0,Driver,InjuredAndHospitalized,Male,,SeatBelt,Yes,,...,Normal,2Vehicles-BehindVehicles-Frontal,route des Ansereuilles,Unknown,Car<=3.5T,0,,Vehicle,LeftFront,NoDirectionChange
2,201800000002,A01,1.0,Driver,Unscathed,Male,,SeatBelt,Yes,,...,VeryGood,NoCollision,Place du général de Gaul,Unknown,Car<=3.5T,0,,Pedestrian,,NoDirectionChange
3,201800000002,A01,,Pedestrian,MildlyInjured,Male,,Helmet,,OnLane<=OnSidewalk0mCrossing,...,VeryGood,NoCollision,Place du général de Gaul,Unknown,Car<=3.5T,0,,Pedestrian,,NoDirectionChange
4,201800000003,A01,1.0,Driver,InjuredAndHospitalized,Male,Leisure,Helmet,Yes,,...,Normal,NoCollision,Rue nationale,Unknown,Motorbike>125cm3,0,StationaryVehicle,Vehicle,Front,NoDirectionChange


# Preprocessing and model evaluation

In [127]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer

In [128]:
# Feature matrix: everything except the two gravity columns (`Gravity_x` is the
# target; `Gravity_y` is the other gravity column produced by the merge and is
# removed as well) and the two identifier columns.
X = df.drop(columns=['Gravity_x', 'Gravity_y', 'AccidentId', 'VehicleId'])

# Target: the `Gravity_x` column.
y = df['Gravity_x']

In [129]:
# Names of the feature columns stored with the `object` dtype; these are the
# columns that get integer-encoded before modelling.
cat_values = X.select_dtypes(include=['object']).columns

In [130]:
# Replace each object-dtype column of X, in place, by integer codes. A fresh
# LabelEncoder is fitted for every column.
# NOTE(review): the encoders are fitted on the whole of X before any
# cross-validation split and are not stored per column, so the mapping cannot
# be re-applied to unseen data -- consider moving the encoding into the
# pipeline.
for column_name in cat_values:
    column_encoder = LabelEncoder()
    X[column_name] = column_encoder.fit_transform(X[column_name])

In [131]:
# Imputer that replaces missing values with the constant -1 instead of a
# statistic computed from the data.
num_imputer = SimpleImputer(
    fill_value=-1,
    strategy='constant',
)

In [132]:
# Column-wise preprocessing: a single transformer, registered as 'num', that
# applies the constant-value imputer to every column of X.
preprocessor = ColumnTransformer(
    [
        ('num', num_imputer, X.columns),
    ]
)

In [133]:
# Candidate classifiers. Both estimators are randomized (bootstrap / feature
# sampling in the forest; sub-sampling and the internal validation split in the
# histogram gradient boosting), so `random_state` is pinned: without it the
# cross-validation scores change on every run even though the CV splitter
# itself is seeded. The seed 0 matches the one used for the splitter.
model1 = RandomForestClassifier(random_state=0)
model2 = HistGradientBoostingClassifier(random_state=0)

In [134]:
# End-to-end estimator: imputation step followed by the random-forest
# classifier (`model1`).
my_pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', model1),
    ]
)

In [135]:
# Five random 70/30 train/test splits with a fixed seed, so the same splits
# are drawn on every run.
cv = ShuffleSplit(test_size=0.3, n_splits=5, random_state=0)

# One score per split, using the pipeline's default scorer; the resulting
# array is the cell's displayed output.
cross_val_score(estimator=my_pipeline, X=X, y=y, cv=cv)

array([0.66689713, 0.6654119 , 0.66513021, 0.67053341, 0.66492535])