In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
from pathlib import Path
import sklearn

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble.forest import RandomForestRegressor



In [3]:
df = pd.read_csv('cars.csv', index_col='Unnamed: 0')
df.head()

Unnamed: 0,id,price,is_new,milage,gearbox,car_make,body_type,car_model,condition,make_year,body_color,drivetrain,engine_type,horse_power,number_owner,configuration,is_right_steering_wheel
0,1767681,727900.0,True,0.0,manual,Лада,,Х-рей,new,2021.0,белый,FWD,gasoline,106.0,,1.6 МТ Classic + пакет Air Conditioner,False
1,2502633,889000.0,False,93000.0,automatic,Honda,station_wagon,Shuttle,average,2015.0,,FWD,hybrid,110.0,,1.5 Hybrid X,True
2,1970294,755000.0,False,81000.0,manual,Toyota,sedan,Corolla,average,2014.0,белый,FWD,gasoline,122.0,2.0,,False
3,1970418,293000.0,False,200000.0,manual,Лада,sedan,Гранта,average,2014.0,белый,FWD,gasoline,87.0,3.0,,False
4,1970292,585000.0,False,91485.0,manual,Kia,sedan,Rio,average,2015.0,,FWD,gasoline,107.0,2.0,,False


In [4]:
# Смотрим информация относительно типов данных и кол-ва данных по каждому столбцу
# на основе этой информации буду решать что делать с данными
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 259407 entries, 0 to 281522
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       259407 non-null  int64  
 1   price                    259407 non-null  float64
 2   is_new                   259407 non-null  bool   
 3   milage                   259407 non-null  float64
 4   gearbox                  259407 non-null  object 
 5   car_make                 259407 non-null  object 
 6   body_type                122635 non-null  object 
 7   car_model                259407 non-null  object 
 8   condition                259127 non-null  object 
 9   make_year                259407 non-null  float64
 10  body_color               239758 non-null  object 
 11  drivetrain               255273 non-null  object 
 12  engine_type              255919 non-null  object 
 13  horse_power              250371 non-null  float64
 14  numb

In [5]:
# Смотрим кореляцию данных на цену автомобиля 
df.corr(method = 'pearson')['price']

id                         0.124028
price                      1.000000
is_new                     0.376311
milage                    -0.281210
make_year                  0.405711
horse_power                0.542885
number_owner              -0.213261
is_right_steering_wheel   -0.154051
Name: price, dtype: float64

In [6]:
# Смотрим кол-во уникальных элементов по каждому столбцу и тип данных столбца
for i in df:
    print(i,'------',len(df[i].unique()), '------', np.dtype(df[i]))

id ------ 259407 ------ int64
price ------ 13478 ------ float64
is_new ------ 2 ------ bool
milage ------ 36372 ------ float64
gearbox ------ 2 ------ object
car_make ------ 111 ------ object
body_type ------ 12 ------ object
car_model ------ 1731 ------ object
condition ------ 4 ------ object
make_year ------ 81 ------ float64
body_color ------ 17 ------ object
drivetrain ------ 4 ------ object
engine_type ------ 7 ------ object
horse_power ------ 493 ------ float64
number_owner ------ 37 ------ float64
configuration ------ 15058 ------ object
is_right_steering_wheel ------ 2 ------ bool


In [7]:
# Т.к солбец configration имеет > 100000 nan данных и 15000 уникальных
# было решено удалить данный столбец так же как и столбец id который имеет низкую
# корреляцию и все элементы отличны друг от друга

df = df.drop(['configuration', 'id'], axis = 1)

In [8]:
# Создаю словарь в котором буду хранить данные вида 
# Имя элемента - уникальное значение 

dict_cars = {}

# Функция переводит объекты в числовые значения (float) и сохраняет эти данные в словаре 
# и обновляет данные в df
def add_in_dict_cars(name):
    print(f'Loading of {name} column', end = '----')
    dict_cars[name] = {}
    counter = 0
    amout_unique = len(df[name].unique())
    while counter != amout_unique:
        dict_cars[name][df[name].unique()[counter]] = float(counter)
        counter += 1
        if counter == amout_unique:
            print('Ready!')
    df[name] = df[name].map(dict_cars[name])
    
for i in df:
    if np.dtype(df[i]) not in [int, float]:
        add_in_dict_cars(i)
df.head()


Loading of is_new column----Ready!
Loading of gearbox column----Ready!
Loading of car_make column----Ready!
Loading of body_type column----Ready!
Loading of car_model column----Ready!
Loading of condition column----Ready!
Loading of body_color column----Ready!
Loading of drivetrain column----Ready!
Loading of engine_type column----Ready!
Loading of is_right_steering_wheel column----Ready!


Unnamed: 0,price,is_new,milage,gearbox,car_make,body_type,car_model,condition,make_year,body_color,drivetrain,engine_type,horse_power,number_owner,is_right_steering_wheel
0,727900.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021.0,0.0,0.0,0.0,106.0,,0.0
1,889000.0,1.0,93000.0,1.0,1.0,1.0,1.0,1.0,2015.0,1.0,0.0,1.0,110.0,,1.0
2,755000.0,1.0,81000.0,0.0,2.0,2.0,2.0,1.0,2014.0,0.0,0.0,0.0,122.0,2.0,0.0
3,293000.0,1.0,200000.0,0.0,0.0,2.0,3.0,1.0,2014.0,0.0,0.0,0.0,87.0,3.0,0.0
4,585000.0,1.0,91485.0,0.0,3.0,2.0,4.0,1.0,2015.0,1.0,0.0,0.0,107.0,2.0,0.0


In [9]:
# Бросаем взор на кол-во nan значений в каждом столбце

for i in df:
    print(i, '-----', len(df[df[i].isna()]))

price ----- 0
is_new ----- 0
milage ----- 0
gearbox ----- 0
car_make ----- 0
body_type ----- 0
car_model ----- 0
condition ----- 0
make_year ----- 0
body_color ----- 0
drivetrain ----- 0
engine_type ----- 0
horse_power ----- 9036
number_owner ----- 110365
is_right_steering_wheel ----- 0


In [10]:
# Есть 2 варианта: 1-ый это заменить все nan значения на среднее с помощью fillna
# 2-ой -выбросить из набора весь столбец, я выбрал 2ой вариант

df = df.drop(['number_owner'], axis = 1)
df = df.dropna()


df.corr()['price']

price                      1.000000
is_new                    -0.377134
milage                    -0.286145
gearbox                    0.290341
car_make                   0.144737
body_type                 -0.026795
car_model                  0.074794
condition                 -0.364771
make_year                  0.405732
body_color                -0.010552
drivetrain                 0.169306
engine_type                0.137948
horse_power                0.542885
is_right_steering_wheel   -0.154283
Name: price, dtype: float64

In [11]:
# Делим наш датасет на метки и тренировочные данные

np.random.seed(42)
columns = df.columns
columns = columns.drop(['price'])
x = df[columns]
y = df['price']

In [12]:
# Делим наши данные для контрольную и тренировочную выборки в соотношении 
# 80% (тренировочные) к 20% (тестовые) и перемешиваем их 

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.8, shuffle = True)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((200296, 13), (50075, 13), (200296,), (50075,))

In [13]:
# Создаем наши модели и сохраняем их внутри словаря

ESTIMATORS = {'RandomForest': RandomForestRegressor(),'ExtraTree': ExtraTreeRegressor() ,'TreeReg':DecisionTreeRegressor() ,'Neighb':KNeighborsRegressor() }

In [14]:
# Обучаем модели и обученные модели сохраняем в словарь model_predict 

model_predict = {}
for i in ESTIMATORS:
    model_predict[i] = ESTIMATORS[i].fit(x_train, y_train)

In [15]:
# Создаем pandas- таблицу в которой будут храниться предсказанные значения и 
# целевые метки

df_pred = pd.DataFrame({'TreeRegressor': model_predict['TreeReg'].predict(x_test), 
                        'RandomForest': model_predict['RandomForest'].predict(x_test),
                       'Neighb': model_predict['Neighb'].predict(x_test),
                        'ExtraTree': model_predict['ExtraTree'].predict(x_test),
                       'goal': y_test})    
df_pred = df_pred.astype(int)
df_pred

Unnamed: 0,TreeRegressor,RandomForest,Neighb,ExtraTree,goal
23701,680000,709876,626000,720000,750000
208016,130000,143591,153600,155555,139000
13144,2250000,2132300,2342000,2250000,1640000
181918,460000,525259,371400,500000,450000
221739,139666,140004,135800,139666,135000
...,...,...,...,...,...
224865,1071511,1071569,1123538,1071511,987900
54751,480000,446920,849000,750000,285000
191319,190000,237350,258000,190000,300000
53234,580000,568035,847800,580000,600000


In [16]:
# Находим ошибку по каждой модели и сохраняем их все в словарь mse

mse = {}
for i in df_pred:
    if i != 'goal':
        mse[i] = np.abs(df_pred[i].sum() - df_pred['goal'].sum())


In [17]:
# Находим наилучшиую модель

count = 0
for i in mse:
    if count == 0:
        min_err = mse[i]
        name_of_est = i
        count += 1
    else:
        count +=1 
        if mse[i] < min_err:
            min_err = mse[i]
            name_of_est = i
print(f'The best estimator is {name_of_est} with mse = {min_err}')
    

The best estimator is ExtraTree with mse = 57556417
