In [126]:
# Import the necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, Normalizer

In [127]:
#load the data
data = pd.read_csv('autoscout24.csv', sep=';')

In [128]:
data.head()

Unnamed: 0,mileage,make,model,fuel,gear,offerType,price,hp,year of sale
0,235000,BMW,316,Diesel,Manual,Used,6800,116.0,2011
1,92800,Volkswagen,Golf,Gasoline,Manual,Used,6877,122.0,2011
2,149300,SEAT,Exeo,Gasoline,Manual,Used,6900,160.0,2011
3,96200,Renault,Megane,Gasoline,Manual,Used,6950,110.0,2011
4,156000,Peugeot,308,Gasoline,Manual,Used,6950,156.0,2011


In [129]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46405 entries, 0 to 46404
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mileage       46405 non-null  int64  
 1   make          46405 non-null  object 
 2   model         46262 non-null  object 
 3   fuel          46405 non-null  object 
 4   gear          46223 non-null  object 
 5   offerType     46405 non-null  object 
 6   price         46405 non-null  int64  
 7   hp            46376 non-null  float64
 8   year of sale  46405 non-null  int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 3.2+ MB


In [130]:
sum_of_all_null_values = data.isnull().sum()[data.isnull().sum() > 0].sum()
print(f'The database has {sum_of_all_null_values} null values')
print(f'This correspond to {sum_of_all_null_values/data.shape[0]*100:.2f}% of the data -> we can drop them for further analysis')

The database has 354 null values
This correspond to 0.76% of the data -> we can drop them for further analysis


In [131]:
#drup the null values
data = data.dropna()

In [132]:
data.describe()

Unnamed: 0,mileage,price,hp,year of sale
count,46071.0,46071.0,46071.0,46071.0
mean,71233.14,16563.12,132.992663,2016.012155
std,62454.13,19279.6,75.074689,3.155555
min,0.0,1100.0,1.0,2011.0
25%,19900.0,7490.0,86.0,2013.0
50%,60000.0,11000.0,116.0,2016.0
75%,105000.0,19490.0,150.0,2019.0
max,1111111.0,1199900.0,850.0,2021.0


In [133]:
data.columns

Index(['mileage', 'make', 'model', 'fuel', 'gear', 'offerType', 'price', 'hp',
       'year of sale'],
      dtype='object')

In [134]:
dummy_data = pd.get_dummies(data=data,drop_first=True, columns=["make","model","fuel","gear","offerType"])


In [135]:
scaler = StandardScaler()
data_scaled = scaler.fit_transform(dummy_data[['mileage','hp','year of sale']])
data_scaled = pd.DataFrame(data_scaled, columns=['mileage','hp','year of sale'])

In [136]:
dummy_data.drop(columns=["mileage","hp","year of sale"],inplace=True)

In [137]:
#concatenate the scaled data with the rest of the data
data = pd.concat([data_scaled, dummy_data], axis=1)

In [138]:
data.head()

Unnamed: 0,mileage,hp,year of sale,price,make_Aixam,make_Alfa,make_Alpina,make_Alpine,make_Aston,make_Audi,...,fuel_Gasoline,fuel_Hydrogen,fuel_LPG,fuel_Others,gear_Manual,gear_Semi-automatic,offerType_Employee's car,offerType_New,offerType_Pre-registered,offerType_Used
0,2.622223,-0.226346,-1.588377,6800.0,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
1,0.345327,-0.146425,-1.588377,6877.0,False,False,False,False,False,False,...,True,False,False,False,True,False,False,False,False,True
2,1.250001,0.359743,-1.588377,6900.0,False,False,False,False,False,False,...,True,False,False,False,True,False,False,False,False,True
3,0.399768,-0.306267,-1.588377,6950.0,False,False,False,False,False,False,...,True,False,False,False,True,False,False,False,False,True
4,1.357281,0.306463,-1.588377,6950.0,False,False,False,False,False,False,...,True,False,False,False,True,False,False,False,False,True


In [141]:
data.describe()

Unnamed: 0,mileage,hp,year of sale,price
count,46071.0,46071.0,46071.0,46071.0
mean,1.184469e-16,-1.480587e-17,-2.193242e-14,16563.12
std,1.000011,1.000011,1.000011,19279.6
min,-1.14058,-1.75817,-1.588377,1100.0
25%,-0.8219423,-0.6259523,-0.9545668,7490.0
50%,-0.1798641,-0.2263459,-0.003852028,11000.0
75%,0.5406725,0.2265413,0.9468628,19490.0
max,16.65045,9.55069,1.580673,1199900.0
