In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
# Load the raw accidents table: tab-separated text, Latin-1 encoded.
accidents = pd.read_csv(
    './data/Accidents.txt',
    encoding="ISO-8859-1",
    sep='\t',
)

In [5]:
# Preview the first five accident records.
accidents.head(n=5)

Unnamed: 0,AccidentId,Gravity,Date,Hour,Light,Department,Commune,InAgglomeration,IntersectionType,Weather,CollisionType,PostalAddress
0,201800000001,NonLethal,2018-01-24,15:05:00,Daylight,590,5,No,Y-type,Normal,2Vehicles-BehindVehicles-Frontal,route des Ansereuilles
1,201800000002,NonLethal,2018-02-12,10:15:00,Daylight,590,11,Yes,Square,VeryGood,NoCollision,Place du général de Gaul
2,201800000003,NonLethal,2018-03-04,11:35:00,Daylight,590,477,Yes,T-type,Normal,NoCollision,Rue nationale
3,201800000004,NonLethal,2018-05-05,17:35:00,Daylight,590,52,Yes,NoIntersection,VeryGood,2Vehicles-Side,30 rue Jules Guesde
4,201800000005,NonLethal,2018-06-26,16:05:00,Daylight,590,477,Yes,NoIntersection,Normal,2Vehicles-Side,72 rue Victor Hugo


In [6]:
# Load the raw users table: tab-separated text, Latin-1 encoded.
users = pd.read_csv(
    './data/Users.txt',
    encoding="ISO-8859-1",
    sep='\t',
)

In [7]:
# Preview the first five user records.
users.head(n=5)

Unnamed: 0,AccidentId,VehicleId,Seat,Category,Gravity,Gender,TripReason,SafetyDevice,SafetyDeviceUsed,PedestrianLocation,PedestrianAction,PedestrianCompany,BirthYear
0,201800000001,A01,1.0,Driver,Unscathed,Male,Leisure,SeatBelt,Yes,,,Unknown,1960.0
1,201800000001,B01,1.0,Driver,InjuredAndHospitalized,Male,,SeatBelt,Yes,,,Unknown,1928.0
2,201800000002,A01,1.0,Driver,Unscathed,Male,,SeatBelt,Yes,,,Unknown,1947.0
3,201800000002,A01,,Pedestrian,MildlyInjured,Male,,Helmet,,OnLane<=OnSidewalk0mCrossing,Crossing,Alone,1959.0
4,201800000003,A01,1.0,Driver,InjuredAndHospitalized,Male,Leisure,Helmet,Yes,,,Unknown,1987.0


In [8]:
# info() prints its summary (dtypes, non-null counts) and returns None, so it
# can run anywhere in the cell.
users.info()

# describe() only *returns* a DataFrame; Jupyter renders a value only when it
# is the last expression of the cell. It used to come first, so its summary
# statistics were computed and silently thrown away.
users.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130169 entries, 0 to 130168
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   AccidentId          130169 non-null  int64  
 1   VehicleId           130169 non-null  object 
 2   Seat                119209 non-null  float64
 3   Category            130169 non-null  object 
 4   Gravity             130169 non-null  object 
 5   Gender              130169 non-null  object 
 6   TripReason          101713 non-null  object 
 7   SafetyDevice        126040 non-null  object 
 8   SafetyDeviceUsed    116075 non-null  object 
 9   PedestrianLocation  10043 non-null   object 
 10  PedestrianAction    10303 non-null   object 
 11  PedestrianCompany   75523 non-null   object 
 12  BirthYear           130139 non-null  float64
dtypes: float64(2), int64(1), object(10)
memory usage: 12.9+ MB


In [9]:
# Load the pre-merged dataset (comma-separated, default encoding).
df = pd.read_csv(
    './src/processed/merged_data.csv',
)

In [10]:
# Preview the first five rows of the merged dataset.
df.head(n=5)

Unnamed: 0,AccidentId,VehicleId,Seat,Category_x,Gravity_x,Gender,TripReason,SafetyDevice,SafetyDeviceUsed,PedestrianLocation,...,Weather,CollisionType,PostalAddress,Direction,Category_y,PassengerNumber,FixedObstacle,MobileObstacle,ImpactPoint,Maneuver
0,201800000001,A01,1.0,Driver,Unscathed,Male,Leisure,SeatBelt,Yes,,...,Normal,2Vehicles-BehindVehicles-Frontal,route des Ansereuilles,Unknown,Car<=3.5T,0,,Vehicle,RightFront,TurnToLeft
1,201800000001,B01,1.0,Driver,InjuredAndHospitalized,Male,,SeatBelt,Yes,,...,Normal,2Vehicles-BehindVehicles-Frontal,route des Ansereuilles,Unknown,Car<=3.5T,0,,Vehicle,LeftFront,NoDirectionChange
2,201800000002,A01,1.0,Driver,Unscathed,Male,,SeatBelt,Yes,,...,VeryGood,NoCollision,Place du général de Gaul,Unknown,Car<=3.5T,0,,Pedestrian,,NoDirectionChange
3,201800000002,A01,,Pedestrian,MildlyInjured,Male,,Helmet,,OnLane<=OnSidewalk0mCrossing,...,VeryGood,NoCollision,Place du général de Gaul,Unknown,Car<=3.5T,0,,Pedestrian,,NoDirectionChange
4,201800000003,A01,1.0,Driver,InjuredAndHospitalized,Male,Leisure,Helmet,Yes,,...,Normal,NoCollision,Rue nationale,Unknown,Motorbike>125cm3,0,StationaryVehicle,Vehicle,Front,NoDirectionChange


# preprocessing and model

On veut prédire l'attribut `Gravity_x` à partir de tous les autres attributs.

In [29]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer

In [30]:
# Feature matrix: everything except the target, Gravity_y (presumably the
# accident-level severity brought in by the merge -- TODO confirm) and the
# two identifier columns.
X = df.drop(
    columns=['Gravity_x', 'Gravity_y', 'AccidentId', 'VehicleId'],
)

# Target column to predict.
y = df['Gravity_x']

In [31]:
# Names of the feature columns stored with the generic `object` dtype.
cat_values = X.select_dtypes(include=['object']).columns

In [32]:
# Replace each object-typed feature column, in place, with integer codes
# produced by a fresh LabelEncoder fitted on that column alone.
# NOTE(review): the encoders are fitted on the whole of X, i.e. before the
# cross-validation split performed later in the notebook.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder;
# OrdinalEncoder is the feature-side equivalent -- consider switching.
for column_name in cat_values:
    encoder = LabelEncoder()
    X[column_name] = encoder.fit_transform(X[column_name])

In [33]:
# Imputer that fills every missing entry with the sentinel value -1.
num_imputer = SimpleImputer(
    fill_value=-1,
    strategy='constant',
)

In [34]:
# Route every column of X through num_imputer. Despite the 'num' label,
# no column subset is selected here: all of X.columns is passed.
preprocessor = ColumnTransformer(
    [('num', num_imputer, X.columns)],
)

In [35]:
# Candidate classifiers. Both are stochastic (bootstrap / feature sampling
# for the forest; binning subsample and early-stopping split for the
# boosting model), so they are seeded explicitly: without a fixed
# random_state the scores change on every Restart & Run All.
# The seed matches the one used by the ShuffleSplit splitter (0).
model1 = RandomForestClassifier(random_state=0)
model2 = HistGradientBoostingClassifier(random_state=0)

In [36]:
# Chain the preprocessing step with the first candidate model (model1).
my_pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', model1),
    ]
)

In [37]:
# Five random 70/30 train/test splits with a fixed seed.
cv = ShuffleSplit(random_state=0, test_size=0.3, n_splits=5)

# Balanced accuracy of the pipeline on each split; left as the last
# expression so the notebook displays the array of scores.
cross_val_score(
    estimator=my_pipeline,
    X=X,
    y=y,
    scoring='balanced_accuracy',
    cv=cv,
)

array([0.47662054, 0.47997184, 0.47921476, 0.4803315 , 0.4790923 ])