In [41]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
import os
import joblib

In [42]:
df = pd.read_csv('data/data_cleaned.csv')

# **Encoding**

#### **One Hot Encoding**

In [43]:
oneHotEncoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [44]:
cols = [
    'boite-de-vitesses',
    'type-de-carburant',
    'marque',
    'modele',
    'origine'
]

cols_encoded = oneHotEncoder.fit_transform(df[cols])

df_encoded = pd.DataFrame(cols_encoded, columns=oneHotEncoder.get_feature_names_out(cols), index=df.index)

df = df.drop(columns=cols)
df = pd.concat([df, pd.DataFrame(df_encoded)], axis=1)

#### **Ordinal Encoding**

In [45]:
df['etat'].unique()

array(['Excellent', 'Neuf', 'Très bon', 'Bon', 'Correct', 'Pour Pièces',
       'Endommagé'], dtype=object)

In [46]:
# Ordinal encoding with explicit order
etats_ordonnees = [
    'Pour Pièces',
    'Endommagé',
    'Correct',
    'Bon',
    'Très bon',
    'Excellent',
    'Neuf'
]
ordinalEncoder = OrdinalEncoder(categories=[etats_ordonnees], handle_unknown='use_encoded_value', unknown_value=-1)
df['etat'] = ordinalEncoder.fit_transform(df[['etat']])

df['etat'].head

<bound method NDFrame.head of 0        5.0
1        6.0
2        4.0
3        5.0
4        4.0
        ... 
26722    5.0
26723    4.0
26724    4.0
26725    5.0
26726    5.0
Name: etat, Length: 26727, dtype: float64>

In [47]:
# boolean_columns = df.select_dtypes(include=['bool']).columns.tolist()
# df[boolean_columns] = df[boolean_columns].replace({True: 1, False: 0})

# df.head()

# **Standarization**

In [48]:
# Colonnes à standardiser
cols_to_standardize = [
    'kilometrage',
    'puissance-fiscale',
    'age',
]

scaler = StandardScaler()
df[cols_to_standardize] = scaler.fit_transform(df[cols_to_standardize])

In [49]:
# df.columns.to_list()

In [50]:
# df.describe()

In [51]:
df.head()

Unnamed: 0,prix,kilometrage,premiere-main,puissance-fiscale,etat,abs,airbags,cd-mp3-bluetooth,camera-de-recul,climatisation,...,modele_i40,modele_iX,modele_iX3,modele_ix35,modele_ix55,modele_raptor,origine_Dédouanée,origine_Importée neuve,origine_Pas encore dédouanée,origine_WW au Maroc
0,78000,1.424524,False,-0.540718,5.0,False,True,True,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,170000,-1.146379,False,-0.896805,6.0,True,True,True,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,62000,1.943898,False,0.171455,4.0,True,True,True,True,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,435000,-1.250253,True,0.171455,5.0,True,True,True,True,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,110000,0.48965,False,-0.540718,4.0,True,True,True,True,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [52]:
df.to_csv('data/data_prepared.csv', index=False)

In [53]:
df

Unnamed: 0,prix,kilometrage,premiere-main,puissance-fiscale,etat,abs,airbags,cd-mp3-bluetooth,camera-de-recul,climatisation,...,modele_i40,modele_iX,modele_iX3,modele_ix35,modele_ix55,modele_raptor,origine_Dédouanée,origine_Importée neuve,origine_Pas encore dédouanée,origine_WW au Maroc
0,78000,1.424524,False,-0.540718,5.0,False,True,True,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,170000,-1.146379,False,-0.896805,6.0,True,True,True,False,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,62000,1.943898,False,0.171455,4.0,True,True,True,True,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,435000,-1.250253,True,0.171455,5.0,True,True,True,True,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,110000,0.489650,False,-0.540718,4.0,True,True,True,True,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26722,365000,0.178026,True,0.883628,5.0,True,True,True,True,True,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
26723,105000,1.424524,False,-0.540718,4.0,False,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
26724,68000,0.593525,False,0.171455,4.0,True,True,True,True,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
26725,155000,-0.029724,False,-0.540718,5.0,False,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
