## **Imports**

In [62]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
import os
import joblib
from sklearn.model_selection import train_test_split

## **Loading Data** 

In [63]:
df = pd.read_csv('data/data_cleaned.csv')

## **Data Spliting**

In [64]:
X = df.drop(columns='prix')
y = df['prix']

X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(X_train.shape)
print(y.shape)

(22597, 27)
(32282,)


## **Encoding**

#### **One Hot Encoding**

In [65]:
oneHotEncoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

cols = [
    'boite-de-vitesses',
    'type-de-carburant',
    'marque',
    'modele',
    'origine'
]

X_train_cols_encoded = oneHotEncoder.fit_transform(X_train[cols])
X_test_cols_encoded = oneHotEncoder.transform(X_test[cols])

X_train_encoded = pd.DataFrame(
    X_train_cols_encoded,
    columns=oneHotEncoder.get_feature_names_out(cols),
    index=X_train.index
)

X_test_encoded = pd.DataFrame(
    X_test_cols_encoded,
    columns=oneHotEncoder.get_feature_names_out(cols),
    index=X_test.index
)

X_train = X_train.drop(columns=cols)
X_test = X_test.drop(columns=cols)

X_train = pd.concat([X_train, X_train_encoded], axis=1)
X_test = pd.concat([X_test, X_test_encoded], axis=1)

#### **Ordinal Encoding**

In [66]:
X_train['etat'].unique()

array(['Très bon', 'Excellent', 'Bon', 'Neuf', 'Correct', 'Endommagé',
       'Pour Pièces'], dtype=object)

In [67]:
# Ordinal encoding with explicit order
etats_ordonnees = [
    'Pour Pièces',
    'Endommagé',
    'Correct',
    'Bon',
    'Très bon',
    'Excellent',
    'Neuf'
]

ordinalEncoder = OrdinalEncoder(categories=[etats_ordonnees], handle_unknown='use_encoded_value', unknown_value=-1)

X_train['etat'] = ordinalEncoder.fit_transform(X_train[['etat']])
X_test['etat'] = ordinalEncoder.transform(X_test[['etat']])

X_test['etat'].head

<bound method NDFrame.head of 24851    3.0
14615    3.0
26759    4.0
12272    4.0
14412    3.0
        ... 
9390     5.0
26803    5.0
4660     5.0
6300     5.0
695      4.0
Name: etat, Length: 9685, dtype: float64>

In [68]:
# boolean_columns = X_train.select_dtypes(include=['bool']).columns.tolist()
# X_train[boolean_columns] = X_train[boolean_columns].replace({True: 1, False: 0})

# X_train.head()

## **Standarization**

In [69]:
scaler = StandardScaler()

cols_to_standardize = [
    'kilometrage',
    'puissance-fiscale',
    'age',
]

X_train[cols_to_standardize] = scaler.fit_transform(X_train[cols_to_standardize])
X_test[cols_to_standardize] = scaler.transform(X_test[cols_to_standardize])

## **Saving**

In [70]:
X_train.to_csv('data/X_train.csv', index=False)
X_test.to_csv('data/X_test.csv', index=False)
y_train.to_csv('data/y_train.csv', index=False)
y_test.to_csv('data/y_test.csv', index=False)

In [72]:
model_dir = 'models'
os.makedirs(model_dir, exist_ok=True)
joblib.dump(ordinalEncoder, 'models/ordinal_encoder.pkl')
joblib.dump(oneHotEncoder, 'models/one_hot_encoder.pkl')
joblib.dump(scaler, 'models/scaler.pkl')

['models/scaler.pkl']