In [155]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
import os
import joblib

In [156]:
df = pd.read_csv('data/data_cleaned.csv')

# **Encoding**

#### **One Hot Encoding**

In [157]:
oneHotEncoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [158]:
cols = [
    'boite-de-vitesses',
    'type-de-carburant',
    'marque',
    'modele',
    # 'etat',
    'origine'
]

cols_encoded = oneHotEncoder.fit_transform(df[cols])

df_encoded = pd.DataFrame(cols_encoded, columns=oneHotEncoder.get_feature_names_out(cols), index=df.index)

df = df.drop(columns=cols)
df = pd.concat([df, pd.DataFrame(df_encoded)], axis=1)

#### **Ordinal Encoding**

In [159]:
df['etat'].unique()

array(['Excellent', 'Neuf', 'Très bon', 'Bon', 'Correct', 'Pour Pièces',
       'Endommagé'], dtype=object)

In [160]:

# Ordinal encoding with explicit order
etats_ordonnees = [
    'Pour Pièces',
    'Endommagé',
    'Correct',
    'Bon',
    'Très bon',
    'Excellent',
    'Neuf'
]
ordinalEncoder = OrdinalEncoder(categories=[etats_ordonnees], handle_unknown='use_encoded_value', unknown_value=-1)
df['etat'] = ordinalEncoder.fit_transform(df[['etat']])

df['etat'].head

<bound method NDFrame.head of 0        5.0
1        6.0
2        4.0
3        5.0
4        4.0
        ... 
26722    5.0
26723    4.0
26724    4.0
26725    5.0
26726    5.0
Name: etat, Length: 26727, dtype: float64>

In [161]:
boolean_columns = df.select_dtypes(include=['bool']).columns.tolist()

df[boolean_columns] = df[boolean_columns].replace({True: 1, False: 0})
df.head()

  df[boolean_columns] = df[boolean_columns].replace({True: 1, False: 0})


Unnamed: 0,prix,annee-modele,kilometrage,premiere-main,puissance-fiscale,etat,abs,airbags,cd-mp3-bluetooth,camera-de-recul,...,modele_i40,modele_iX,modele_iX3,modele_ix35,modele_ix55,modele_raptor,origine_Dédouanée,origine_Importée neuve,origine_Pas encore dédouanée,origine_WW au Maroc
0,78000,2010,274999.5,0,6,5.0,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,170000,2023,27499.5,0,5,6.0,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,62000,2005,324999.5,0,8,4.0,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,435000,2020,17499.5,1,8,5.0,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,110000,2015,184999.5,0,6,4.0,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# **Standarization**

In [162]:
# Colonnes à standardiser
cols_to_standardize = [
    'annee-modele',
    'kilometrage',
    'puissance-fiscale'
]

scaler = StandardScaler()
df[cols_to_standardize] = scaler.fit_transform(df[cols_to_standardize])

In [163]:
# df.columns.to_list()

In [164]:
# df.describe()

In [165]:
df.head()

Unnamed: 0,prix,annee-modele,kilometrage,premiere-main,puissance-fiscale,etat,abs,airbags,cd-mp3-bluetooth,camera-de-recul,...,modele_i40,modele_iX,modele_iX3,modele_ix35,modele_ix55,modele_raptor,origine_Dédouanée,origine_Importée neuve,origine_Pas encore dédouanée,origine_WW au Maroc
0,78000,-0.279183,1.424524,0,-0.540718,5.0,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,170000,1.264825,-1.146379,0,-0.896805,6.0,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,62000,-0.873032,1.943898,0,0.171455,4.0,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,435000,0.908516,-1.250253,1,0.171455,5.0,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,110000,0.314666,0.48965,0,-0.540718,4.0,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [166]:
df.to_csv('data/data_prepared.csv', index=False)