In [1]:
import pandas as pd
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer, precision_score, recall_score
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.tree import export_graphviz,export_text
from subprocess import call

In [2]:
dataframes = []
dossier = "Data/"

for annee in range(2012, 2023):
    chemin_caracteristiques = os.path.join(dossier, f"BAAC-Annee-{annee}/caracteristiques_{annee}_.csv")
    chemin_lieux = os.path.join(dossier, f"BAAC-Annee-{annee}/lieux_{annee}_.csv")
    chemin_usagers = os.path.join(dossier, f"BAAC-Annee-{annee}/usagers_{annee}_.csv")
    chemin_vehicules = os.path.join(dossier, f"BAAC-Annee-{annee}/vehicules_{annee}_.csv")
    
    carac = pd.read_csv(chemin_caracteristiques, delimiter=';', encoding='ISO-8859-1', usecols=lambda x: x != 'Unnamed: 0')
    lieux = pd.read_csv(chemin_lieux, delimiter=';', encoding='ISO-8859-1', usecols=lambda x: x != 'Unnamed: 0')
    usagers = pd.read_csv(chemin_usagers, delimiter=';', encoding='ISO-8859-1', usecols=lambda x: x != 'Unnamed: 0')
    vehicules = pd.read_csv(chemin_vehicules, delimiter=';', encoding='ISO-8859-1', usecols=lambda x: x != 'Unnamed: 0')
    
    df = pd.merge(carac, lieux, on="Num_Acc")
    df = pd.merge(df, vehicules, on="Num_Acc")
    df = pd.merge(df, usagers, on="Num_Acc")
    
    dataframes.append(df)

df_final = pd.concat(dataframes, ignore_index=True)

  lieux = pd.read_csv(chemin_lieux, delimiter=';', encoding='ISO-8859-1', usecols=lambda x: x != 'Unnamed: 0')
  lieux = pd.read_csv(chemin_lieux, delimiter=';', encoding='ISO-8859-1', usecols=lambda x: x != 'Unnamed: 0')
  lieux = pd.read_csv(chemin_lieux, delimiter=';', encoding='ISO-8859-1', usecols=lambda x: x != 'Unnamed: 0')
  lieux = pd.read_csv(chemin_lieux, delimiter=';', encoding='ISO-8859-1', usecols=lambda x: x != 'Unnamed: 0')


In [3]:
data_1 = df_final

In [4]:
data_1['hrmn'] = data_1['hrmn'].astype(str).str.replace(':', '')

def map_hours_to_category(hour):
    if 0 <= hour < 600:
        return 0
    elif 600 <= hour < 1200:
        return 1
    elif 1200 <= hour < 1800:
        return 2
    else:
        return 3
data_1['hrmn'] = data_1['hrmn'].astype(int)   
data_1['hrmn'] = data_1['hrmn'].apply(map_hours_to_category)

In [5]:
data_1['hrmn'].unique()

array([3, 2, 0, 1], dtype=int64)

In [6]:
data_1['grav'] = data_1['grav'].replace({1: 0, 4: 0, 2: 1, 3: 1})

In [7]:
cols_to_drop = ['mois','jour', 'com','dep', 'adr', 'gps', 'lat', 'long', 'voie', 'v1', 'v2', 'pr', 'pr1', 'num_veh_x', 'id_vehicule_x','id_vehicule_y','num_veh_y', 'id_usager','motor','secu2','secu3']
data_1.drop(columns=cols_to_drop, inplace=True)

In [8]:
data_1['age_conducteur'] = data_1['an'] - data_1['an_nais']

def categorize_age(age):
    if pd.isna(age):
        return -1
    elif age <= 20:
        return 1
    elif 21 <= age <= 40:
        return 2
    elif 41 <= age <= 65:
        return 3
    elif age > 65:
        return 4
    else:
        return -1 

data_1['age_conducteur'] = data_1['age_conducteur'].apply(categorize_age)

In [9]:
data_1.drop(columns=['an', 'an_nais'], inplace=True)

In [10]:
def combine_secu(secu, secu1):
    base_protection = {1, 2}
    child_protection = {3}
    advanced_protection = {4, 5, 6, 7}
    
    secu_set = {secu, secu1}
    
    if secu_set.intersection({-1, 0}):  
        return -1  
    elif secu_set.intersection(base_protection):
        return 1  
    elif secu_set.intersection(child_protection):
        return 2  
    elif secu_set.intersection(advanced_protection):
        return 3  
    else:
        return -1 

data_1['secu_combined'] = data_1.apply(lambda row: combine_secu(row['secu'], row['secu1']), axis=1)

In [11]:
data_1.drop(columns=['secu', 'secu1'], inplace=True)

In [12]:
def categorize_catv(catv):
    if pd.isna(catv):
        return -1  
    if catv in {1, 30, 31, 32, 33, 34, 41, 42, 43}:
        return 1  # Bicyclettes et deux roues
    if catv in {2, 3, 7, 10, 11, 12}:
        return 2  # Voitures et voitures légères
    if catv in {13, 14, 15, 16, 17, 18, 20, 21}:
        return 3  # Véhicules lourds
    if catv in {37, 38, 39, 40}:
        return 4  # Transports en commun
    if catv in {50, 60, 80, 99}:
        return 5  # Engins spéciaux et autres
    return -1 
data_1['catv'] = data_1['catv'].apply(categorize_catv)

In [13]:
#data_1['age_conducteur'] = data_1['age_conducteur'].astype('object')
#data_1['secu_combined'] = data_1['secu_combined'].astype('object')

In [14]:
def fill_vma(row):
    if pd.isna(row['vma']):
        if row['catr'] == 1:
            return 130  # Autoroute
        elif row['catr'] == 2 or row['catr'] == 3:
            return 80  # Route nationale ou départementale
        elif row['catr'] == 4 or row['catr'] == 5 or row['catr'] == 9:
            return 50  # Voie communale, Hors réseau public, Autre
        elif row['catr'] == 6:
            return 30  # Parc de stationnement
        elif row['catr'] == 7:
            return 50  # Routes de métropole urbaine
        else:
            return 50  # Valeur par défaut si catr non spécifié
    else:
        return row['vma']

data_1['vma'] = data_1.apply(fill_vma, axis=1)

In [15]:
def fill_and_correct_vma(row):
    speed_limits = {
        1: 130,  # Autoroute
        2: 80,   # Route nationale
        3: 80,   # Route départementale
        4: 50,   # Voie communale
        5: 50,   # Hors réseau public
        6: 30,   # Parc de stationnement
        7: 50,   # Routes de métropole urbaine
        9: 50    # Autre
    }

    if pd.isna(row['vma']) or row['vma'] > 130 or row['vma'] < 0:
        return speed_limits.get(row['catr'], 50)
    else:
        if row['vma'] not in speed_limits.values():
            return speed_limits.get(row['catr'], 50)  
        else:
            return row['vma']

data_1['vma'] = data_1.apply(fill_and_correct_vma, axis=1)

In [16]:
data_1['nbv'] = pd.to_numeric(data_1['nbv'].astype(str).str.strip(), errors='coerce')
data_1['nbv'] = data_1['nbv'].fillna(-1).astype(int)
data_1['nbv'] = data_1['nbv'].apply(lambda x: x if x >= 0 else -1)

In [17]:
def fill_occutc(row):
    if pd.isna(row['occutc']):
        if row['catv'] in {1, 5}:
            return 1
        else:
            return mean_occupants.get(row['catv'], -1) 
    else:
        return row['occutc']

mean_occupants = {
    cat: data_1[data_1['catv'] == cat]['occutc'].mean()
    for cat in data_1['catv'].unique() if cat not in {1, 5}
}

data_1['occutc'] = data_1.apply(fill_occutc, axis=1)

In [18]:
data_1.drop(columns=['lartpc', 'larrout','env1','place'], inplace=True)

In [19]:
cols_to_fill = ['atm', 'col', 'circ', 'vosp', 'prof', 'plan', 'surf', 'infra', 'situ','senc', 'occutc', 'obs', 'obsm', 'choc', 'manv', 'trajet', 'locp', 'etatp', 'actp']
data_1[cols_to_fill] = data_1[cols_to_fill].fillna(-1)

In [20]:
def clean_actp(value):
    if value in ['B', 0, '0', ' -1', -1]:
        return -1
    elif value == 'A':
        return 10
    else:
        try:
            return int(value)  
        except:
            return -1 

data_1['actp'] = data_1['actp'].apply(clean_actp)

In [21]:
data_1.head()

Unnamed: 0,Num_Acc,hrmn,lum,agg,int,atm,col,catr,circ,nbv,...,catu,grav,sexe,trajet,locp,actp,etatp,vma,age_conducteur,secu_combined
0,201200000001,3,5,2,1,1.0,6.0,3,2.0,0,...,1,0,2,5.0,0.0,-1,0.0,80.0,1,-1
1,201200000001,3,5,2,1,1.0,6.0,3,2.0,0,...,3,0,1,5.0,3.0,3,2.0,80.0,1,-1
2,201200000002,3,5,2,1,1.0,3.0,3,2.0,2,...,1,0,1,5.0,0.0,-1,0.0,80.0,1,-1
3,201200000002,3,5,2,1,1.0,3.0,3,2.0,2,...,2,0,1,0.0,0.0,-1,0.0,80.0,1,-1
4,201200000002,3,5,2,1,1.0,3.0,3,2.0,2,...,1,0,1,5.0,0.0,-1,0.0,80.0,1,-1


In [22]:
data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2437087 entries, 0 to 2437086
Data columns (total 33 columns):
 #   Column          Dtype  
---  ------          -----  
 0   Num_Acc         int64  
 1   hrmn            int64  
 2   lum             int64  
 3   agg             int64  
 4   int             int64  
 5   atm             float64
 6   col             float64
 7   catr            int64  
 8   circ            float64
 9   nbv             int64  
 10  vosp            float64
 11  prof            float64
 12  plan            float64
 13  surf            float64
 14  infra           float64
 15  situ            float64
 16  senc            float64
 17  catv            int64  
 18  occutc          float64
 19  obs             float64
 20  obsm            float64
 21  choc            float64
 22  manv            float64
 23  catu            int64  
 24  grav            int64  
 25  sexe            int64  
 26  trajet          float64
 27  locp            float64
 28  actp        

In [63]:
max_cols = ['hrmn', 'vma', 'age_conducteur', 'occutc', 'catu','nbv','grav']
mode_cols = ['lum', 'agg', 'int', 'atm', 'col', 'circ', 'vosp', 'prof', 'plan', 'surf', 'infra', 'situ', 'obs', 'obsm', 'manv', 'trajet', 'locp', 'etatp', 'actp']
target_cols = ['catv', 'col','vosp','senc', 'choc', 'secu_combined', 'sexe']

In [64]:
def process_duplicates(df, target_cols, max_cols, mode_cols):
    grouped = df.groupby('Num_Acc')
    processed_data = []

    for _, group in grouped:
        processed_row = {}
        for col in df.columns:
            if col in max_cols:
                processed_row[col] = group[col].max()
            elif col in mode_cols:
                mode_values = group[col].mode()
                processed_row[col] = mode_values[0] if not mode_values.empty else -1
            elif col in target_cols:
                if 1 in group[col].values:
                    processed_row[col] = 1
                else:
                    mode_values = group[col].mode()
                    processed_row[col] = mode_values[0] if not mode_values.empty else -1
            else:
                processed_row[col] = group[col].iloc[0]

        processed_data.append(processed_row)

    processed_df = pd.DataFrame(processed_data)
    return processed_df

Attention 17 min pour run ca 

In [65]:
processed_df = process_duplicates(data_1, target_cols=target_cols, max_cols=max_cols, mode_cols=mode_cols)

In [66]:
processed_df['grav'].unique()

array([0, 1], dtype=int64)

In [68]:
processed_df.head()

Unnamed: 0,Num_Acc,hrmn,lum,agg,int,atm,col,catr,circ,nbv,...,catu,grav,sexe,trajet,locp,actp,etatp,vma,age_conducteur,secu_combined
0,201200000001,3,5,2,1,1.0,6.0,3,2.0,0,...,3,0,1,5.0,0.0,-1,0.0,80.0,1,-1
1,201200000002,3,5,2,1,1.0,3.0,3,2.0,2,...,2,1,1,0.0,0.0,-1,0.0,80.0,1,-1
2,201200000003,3,5,2,1,1.0,5.0,3,2.0,2,...,2,1,1,5.0,0.0,-1,0.0,80.0,1,-1
3,201200000004,3,5,2,1,1.0,3.0,3,2.0,2,...,2,0,1,5.0,0.0,-1,0.0,80.0,1,-1
4,201200000006,3,5,2,1,1.0,6.0,4,2.0,0,...,3,0,1,5.0,0.0,-1,0.0,50.0,1,1


In [69]:
processed_df = processed_df.drop_duplicates()

DUMMIESSSSS

In [29]:
columns_to_convert = ['hrmn','agg','lum','int','atm','col','catr','circ','vosp','prof','plan','surf','infra','situ',
                      'senc','catv','obs','obsm','choc','manv','catu','sexe','trajet','vma','locp','actp','etatp',
                      'age_conducteur','secu_combined']

df_dummies = pd.get_dummies(data_1, columns=columns_to_convert, drop_first=True)

SUBSET FORWARD SELECTION

In [91]:
sampled_data = processed_df.sample(n=60000, random_state=42)

X = sampled_data.drop(columns=['grav'])
y = sampled_data['grav']

In [92]:
from sklearn.linear_model import Lasso

In [93]:
lasso_model = Lasso(alpha=0.002)
lasso_model.fit(X, y)

In [94]:
selected_columns = X.columns[lasso_model.coef_ != 0]
print("Variables sélectionnées par Lasso:", selected_columns)

Variables sélectionnées par Lasso: Index(['Num_Acc', 'hrmn', 'lum', 'agg', 'int', 'atm', 'col', 'catr', 'circ',
       'nbv', 'prof', 'plan', 'surf', 'infra', 'situ', 'senc', 'catv',
       'occutc', 'obs', 'obsm', 'choc', 'manv', 'catu', 'sexe', 'trajet',
       'locp', 'actp', 'etatp', 'vma', 'age_conducteur', 'secu_combined'],
      dtype='object')


TRAINNING

In [95]:
sample_data = processed_df.sample(n=70000, random_state=42)

X1 = sample_data[selected_columns]
y1 = sample_data['grav']

X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.3, random_state=42)

In [96]:
sample_data['grav'].unique()

array([0, 1], dtype=int64)

In [32]:
gb=GradientBoostingClassifier(random_state=42)
param_grid = { 
    'learning_rate' :  [0.01,0.05,0.1, 0.25, 0.5, 0.75,1]
}
CV_gb = GridSearchCV(estimator=gb, param_grid=param_grid, cv= 5)
CV_gb.fit(X1, y1)

In [33]:
CV_gb.best_params_

{'learning_rate': 0.5}

In [97]:
gb = GradientBoostingClassifier(learning_rate=0.5)
scores = cross_validate(gb, X_train, y_train,scoring=('accuracy', 'precision','recall'), cv=10)

In [98]:
print(scores['test_accuracy'].mean())
print(scores['test_precision'].mean())

0.736265306122449
0.7192132149265096


In [99]:
gb.fit(X_train, y_train)

predictions = gb.predict(X_test) 
accuracy = accuracy_score(y_test,predictions)
print("Accuracy :", accuracy)

Accuracy : 0.7378095238095238


In [105]:
predictions_prob = gb.predict_proba(X_test)[:, 1]

In [187]:
imp = zip(selected_columns,gb.feature_importances_)
for name,val in imp : 
    print(name,  val)

Num_Acc 0.04698649997565297
nbv 0.01965426619339307
hrmn_2 0.0019486371610383025
agg_2 0.06750897681397633
lum_1 0.0010605615366429755
lum_3 0.005688465061953477
int_1 0.005376481574938257
col_1.0 0.02401423170129951
col_2.0 0.004483909109577591
col_4.0 0.0027722237131417395
col_6.0 0.0045298368749534905
col_7.0 0.005992449991019439
catr_3 0.011967805747036632
circ_1.0 0.008409566656781746
circ_2.0 0.049522101548632534
plan_1.0 0.005033578005685117
surf_1.0 0.0030879839201426026
surf_2.0 0.0006472060512030777
infra_0.0 0.004348098217711117
situ_1.0 0.014791605001711254
situ_3.0 0.004523672039223396
senc_2.0 0.0017591826193430516
catv_1 0.04609573536272075
catv_2 0.03835587444425112
obs_0.0 0.05610569382134612
obs_2.0 0.018799563434276807
obs_6.0 0.007630153650431267
obsm_0.0 0.18130272371181527
obsm_1.0 0.015136716708388667
choc_1.0 0.002648600741411299
manv_0.0 0.004783372617424697
manv_1.0 0.004985132844569885
manv_13.0 0.005437314453750273
catu_2 0.008758320410499964
catu_3 0.086270

In [106]:
conf_matrix = confusion_matrix(y_test, predictions)

print(conf_matrix)

[[10599  1912]
 [ 3594  4895]]


In [107]:
fpr, tpr, thresholds = roc_curve(y_test, predictions_prob)
roc_auc = auc(fpr, tpr)
print("AUC :", roc_auc)

# Tracer la courbe ROC
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

TypeError: 'numpy.float64' object is not callable

In [108]:
auc = roc_auc_score(y_test,predictions_prob)
auc

0.7919437538857901

Test

In [109]:
carac_test = pd.read_csv("Data/TEST/CARACTERISTIQUES.csv",delimiter=',', encoding= 'ISO-8859-1', usecols=lambda x: x != 'Unnamed: 0')
lieux_test = pd.read_csv("Data/TEST/LIEUX.csv",delimiter=',', encoding= 'ISO-8859-1', usecols=lambda x: x != 'Unnamed: 0')
usa_test = pd.read_csv("Data/TEST/USAGERS.csv",delimiter=',', encoding= 'ISO-8859-1', usecols=lambda x: x != 'Unnamed: 0')
vehi_test = pd.read_csv("Data/TEST/VEHICULES.csv",delimiter=',', encoding= 'ISO-8859-1', usecols=lambda x: x != 'Unnamed: 0')

  carac_test = pd.read_csv("Data/TEST/CARACTERISTIQUES.csv",delimiter=',', encoding= 'ISO-8859-1', usecols=lambda x: x != 'Unnamed: 0')
  lieux_test = pd.read_csv("Data/TEST/LIEUX.csv",delimiter=',', encoding= 'ISO-8859-1', usecols=lambda x: x != 'Unnamed: 0')
  usa_test = pd.read_csv("Data/TEST/USAGERS.csv",delimiter=',', encoding= 'ISO-8859-1', usecols=lambda x: x != 'Unnamed: 0')
  vehi_test = pd.read_csv("Data/TEST/VEHICULES.csv",delimiter=',', encoding= 'ISO-8859-1', usecols=lambda x: x != 'Unnamed: 0')


In [110]:
df_pred = pd.merge(carac_test, lieux_test, on="Num_Acc")
df_pred = pd.merge(df_pred, vehi_test, on="Num_Acc")
df_pred = pd.merge(df_pred, usa_test, on="Num_Acc")

In [111]:
# 1ère transformation : Conversion de la colonne 'hrmn' en catégories
df_pred['hrmn'] = df_pred['hrmn'].astype(str).str.replace(':', '')
df_pred['hrmn'] = df_pred['hrmn'].astype(int)
df_pred['hrmn'] = df_pred['hrmn'].apply(map_hours_to_category)

# 4ème transformation : Suppression de colonnes spécifiques
cols_to_drop = ['mois','jour', 'com','dep', 'adr', 'gps', 'lat', 'long', 'voie', 'v1', 'v2', 'pr', 'pr1', 'num_veh_x', 'id_vehicule_x','id_vehicule_y','num_veh_y', 'id_usager','motor','secu2','secu3']
df_pred.drop(columns=cols_to_drop, inplace=True)

# 5ème transformation : Calcul de l'âge du conducteur et catégorisation
df_pred['age_conducteur'] = df_pred['an'] - df_pred['an_nais']
df_pred['age_conducteur'] = df_pred['age_conducteur'].apply(categorize_age)
df_pred.drop(columns=['an', 'an_nais'], inplace=True)

# 6ème transformation : Combinaison de colonnes 'secu' et 'secu1' et suppression
df_pred['secu_combined'] = df_pred.apply(lambda row: combine_secu(row['secu'], row['secu1']), axis=1)
df_pred.drop(columns=['secu', 'secu1'], inplace=True)

# 7ème transformation : Catégorisation de la colonne 'catv'
df_pred['catv'] = df_pred['catv'].apply(categorize_catv)

# 8ème transformation : Remplissage de la vitesse maximale manquante
df_pred['vma'] = df_pred.apply(fill_and_correct_vma, axis=1)

# 9ème transformation : Traitement de la colonne 'nbv'
df_pred['nbv'] = pd.to_numeric(df_pred['nbv'].astype(str).str.strip(), errors='coerce')
df_pred['nbv'] = df_pred['nbv'].fillna(-1).astype(int)
df_pred['nbv'] = df_pred['nbv'].apply(lambda x: x if x >= 0 else -1)

# 10ème transformation : Remplissage de la colonne 'occutc'
df_pred['occutc'] = df_pred.apply(fill_occutc, axis=1)

# 11ème transformation : Suppression de colonnes spécifiques
df_pred.drop(columns=['lartpc', 'larrout','env1','place'], inplace=True)

# 12ème transformation : Remplissage de valeurs manquantes dans certaines colonnes
cols_to_fill = ['atm', 'col', 'circ', 'vosp', 'prof', 'plan', 'surf', 'infra', 'situ','senc', 'occutc', 'obs', 'obsm', 'choc', 'manv', 'trajet', 'locp', 'etatp', 'actp']
df_pred[cols_to_fill] = df_pred[cols_to_fill].fillna(-1)

# 13ème transformation : Nettoyage de la colonne 'actp'
df_pred['actp'] = df_pred['actp'].apply(clean_actp)

In [112]:
df_pred.head()

Unnamed: 0,Num_Acc,hrmn,lum,agg,int,atm,col,catr,circ,nbv,...,choc,manv,catu,sexe,trajet,locp,actp,etatp,age_conducteur,secu_combined
0,201200049538,2,1,1,1,1.0,4.0,1,1.0,4,...,4.0,2.0,1,2,1.0,0.0,-1,0.0,1,-1
1,201200049538,2,1,1,1,1.0,4.0,1,1.0,4,...,4.0,2.0,1,2,1.0,0.0,-1,0.0,1,-1
2,201200049538,2,1,1,1,1.0,4.0,1,1.0,4,...,4.0,2.0,1,2,2.0,0.0,-1,0.0,1,-1
3,201200049538,2,1,1,1,1.0,4.0,1,1.0,4,...,5.0,2.0,1,2,1.0,0.0,-1,0.0,1,-1
4,201200049538,2,1,1,1,1.0,4.0,1,1.0,4,...,5.0,2.0,1,2,1.0,0.0,-1,0.0,1,-1


In [113]:
max_cols = ['hrmn', 'vma', 'age_conducteur', 'occutc', 'catu','nbv','grav']
mode_cols = ['lum', 'agg', 'int', 'atm', 'col', 'circ', 'vosp', 'prof', 'plan', 'surf', 'infra', 'situ', 'obs', 'obsm', 'manv', 'trajet', 'locp', 'etatp', 'actp']
target_cols = ['catv', 'col','vosp','senc', 'choc', 'secu_combined', 'sexe']

In [114]:
processed_df_test = process_duplicates(df_pred, target_cols=target_cols, max_cols=max_cols, mode_cols=mode_cols)

In [115]:
columns_to_convert = ['hrmn','agg','lum','int','atm','col','catr','circ','vosp','prof','plan','surf','infra','situ',
                      'senc','catv','obs','obsm','choc','manv','catu','sexe','trajet','locp','actp','etatp','vma',
                      'age_conducteur','secu_combined']

df_dummies = pd.get_dummies(df_pred, columns=columns_to_convert, drop_first=False)

In [116]:
X_new = processed_df_test[X_train.columns]

In [117]:
probabilites_grave = gb.predict_proba(X_new)[:, 1]  
probabilites_grave = np.clip(probabilites_grave, 0, 1)  


predictions_df = pd.DataFrame({'Num_Acc': X_new['Num_Acc'], 'GRAVE': probabilites_grave})
predictions_df.to_csv('predictions.csv', index=False)