In [208]:
# import libraries
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [209]:
# Load the dataset: the CSV is semicolon-delimited and its first column is used as the row index
data = pd.read_csv(
    'dataAutOcaz.csv',
    sep=';',
    index_col=0,
)

In [210]:
# Preview the first five rows of the freshly loaded frame
data.head(n=5)


Unnamed: 0,Marque,Année,Code Postal,Carburant,Emission Co2,Transmission,nbPortes,nbPlace,Kilométrage,Prix
0,OPEL,2017,31,Diesel,110-130,Manuelle,5,5,119000,13900
1,VOLKSWAGEN,2011,55,Diesel,110-130,Manuelle,5,5,72330,13490
2,AUDI,2016,24,Diesel,110-130,Manuelle,5,5,114100,17990
3,OPEL,2016,66,Diesel,110-130,Manuelle,5,5,121019,10990
4,HONDA,2018,69,Diesel,110-130,Manuelle,5,5,88653,21300


In [211]:
# Collapse the CO2 emission brackets to their upper bound ('110-130' -> 130,
# '130-180' -> 180) and store the column with a numeric dtype.
# Any value that is not one of the two brackets is passed through unchanged and
# must already be numeric text; pd.to_numeric raises on anything else, so an
# unexpected label fails here rather than later inside the scaler.
CO2_BRACKET_UPPER_BOUND = {'110-130': 130, '130-180': 180}
data['Emission Co2'] = pd.to_numeric(
    data['Emission Co2'].map(
        lambda bracket: CO2_BRACKET_UPPER_BOUND.get(bracket, bracket)
    )
)

In [212]:
# Shuffle the rows. The fixed seed makes the row order reproducible across
# kernel restarts, so the seeded train/test split further down always selects
# the same rows and the reported score can be reproduced.
data = data.sample(frac=1, random_state=0)
data.head()

Unnamed: 0,Marque,Année,Code Postal,Carburant,Emission Co2,Transmission,nbPortes,nbPlace,Kilométrage,Prix
5658,RENAULT,2021,31,Diesel,130,Manuelle,5,5,15960,20140
14074,SEAT,2019,93,Essence,130,Manuelle,5,5,23575,21730
18848,VOLVO,2022,74,Electric,110,Automatique,5,5,1403,29070
4840,RENAULT,2015,51,Diesel,130,Manuelle,5,5,151760,9620
8647,OPEL,2018,68,Essence,130,Manuelle,5,5,119751,14350


In [213]:
# Number of listings (non-null 'Marque' entries) for each door count, largest group first
nbPortesMarque = data.groupby('nbPortes')['Marque'].count()
nbPortesMarque.sort_values(ascending=False).head(10)

nbPortes
5    22480
3      582
Name: Marque, dtype: int64

In [214]:
# Number of listings (non-null 'Marque' entries) for each fuel type, largest group first
CarburantMarque = data.groupby('Carburant')['Marque'].count()
CarburantMarque.sort_values(ascending=False).head(10)

Carburant
Diesel      10166
Essence      9248
Electric     3648
Name: Marque, dtype: int64

In [215]:
# Number of listings (non-null 'Marque' entries) for each seat count, largest group first
nbPlaceMarque = data.groupby('nbPlace')['Marque'].count()
nbPlaceMarque.sort_values(ascending=False).head(10)

nbPlace
5    22480
4      582
Name: Marque, dtype: int64

In [216]:
# Average price per brand, ten most expensive brands first.
# 'Prix' is selected before aggregating so that only that column is averaged:
# calling .mean() on the whole grouped frame raises TypeError on pandas >= 2.0
# because the frame also holds non-numeric columns.
prixMoyen = data.groupby('Marque')['Prix'].mean()
prixMoyen.sort_values(ascending=False).head(10)

Marque
CUPRA         36774.000000
SANTANA       27600.000000
LAND          26160.858974
PORSCHE       25922.000000
VOLVO         23761.785714
JAGUAR        23253.131579
BMW           23049.854025
SERES         22590.909091
DS            22589.859813
VOLKSWAGEN    22537.204096
Name: Prix, dtype: float64

In [217]:
# Encode the gearbox type as an integer: 'Manuelle' -> 0, 'Automatique' -> 1.
# The whole column is rebuilt and cast to a numeric dtype instead of writing
# ints into a text column, which would leave an object column of mixed values.
# Values that are not in the mapping pass through unchanged, so re-running the
# cell on already encoded data is a no-op; pd.to_numeric raises on any
# unexpected label.
TRANSMISSION_CODES = {'Manuelle': 0, 'Automatique': 1}
data['Transmission'] = pd.to_numeric(
    data['Transmission'].map(lambda label: TRANSMISSION_CODES.get(label, label))
)


In [218]:
# Encode the fuel type as an integer: 'Diesel' -> 0, 'Essence' -> 1, 'Electric' -> 2.
# The whole column is rebuilt and cast to a numeric dtype instead of writing
# ints into a text column, which would leave an object column of mixed values.
# Values that are not in the mapping pass through unchanged, so re-running the
# cell on already encoded data is a no-op; pd.to_numeric raises on any
# unexpected label.
CARBURANT_CODES = {'Diesel': 0, 'Essence': 1, 'Electric': 2}
data['Carburant'] = pd.to_numeric(
    data['Carburant'].map(lambda label: CARBURANT_CODES.get(label, label))
)

In [219]:
# Check the first five rows after encoding 'Transmission' and 'Carburant'
data.head(n=5)

Unnamed: 0,Marque,Année,Code Postal,Carburant,Emission Co2,Transmission,nbPortes,nbPlace,Kilométrage,Prix
5658,RENAULT,2021,31,0,130,0,5,5,15960,20140
14074,SEAT,2019,93,1,130,0,5,5,23575,21730
18848,VOLVO,2022,74,2,110,1,5,5,1403,29070
4840,RENAULT,2015,51,0,130,0,5,5,151760,9620
8647,OPEL,2018,68,1,130,0,5,5,119751,14350


In [220]:
# Replace each brand name in 'Marque' with its integer category code
data['Marque'] = data['Marque'].astype('category').cat.codes


In [221]:
# Check the first five rows after encoding 'Marque'
data.head(n=5)

Unnamed: 0,Marque,Année,Code Postal,Carburant,Emission Co2,Transmission,nbPortes,nbPlace,Kilométrage,Prix
5658,30,2021,31,0,130,0,5,5,15960,20140
14074,32,2019,93,1,130,0,5,5,23575,21730
18848,41,2022,74,2,110,1,5,5,1403,29070
4840,30,2015,51,0,130,0,5,5,151760,9620
8647,27,2018,68,1,130,0,5,5,119751,14350


In [222]:
# Target: the 'Prix' column
y = data['Prix']

# Inputs: every remaining column
X = data.drop(columns='Prix')

# Report both shapes, one per line
print(X.shape, y.shape, sep='\n')


(23062, 9)
(23062,)


In [223]:
# Hold out 20% of the rows for evaluation (seeded split), then report the four shapes, one per line
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(18449, 9)
(4613, 9)
(18449,)
(4613,)


In [224]:
# Standardising scaler with its default settings spelled out (centre on the mean, scale to unit variance)
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)


In [225]:
# Learn the scaling statistics from the training inputs only, then apply that
# same fitted transform to both the training and the test inputs
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [226]:
# Support vector regressor with an RBF kernel; C and gamma are hand-picked values
model = SVR(
    kernel='rbf',
    gamma=0.1,
    C=1000.0,
)


In [227]:
# Fit the regressor on the scaled training inputs and their prices
model.fit(X=X_train, y=y_train)


SVR(C=1000.0, gamma=0.1)

In [228]:
# Predict prices for the held-out test inputs
y_pred = model.predict(X=X_test)


In [229]:
# Print the model's score on the held-out test set (for a regressor, score() is the R^2 coefficient)
print(model.score(X=X_test, y=y_test))

0.3806594328089298
