<a href="https://colab.research.google.com/github/alxkzncoff/skillfactory_rds6_car_price_prediction_ml_dl/blob/main/model/cat_boost_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# INSTALL

In [3]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/52/39/128fff65072c8327371e3c594f3c826d29c85b21cb6485980353b168e0e4/catboost-0.24.2-cp36-none-manylinux1_x86_64.whl (66.1MB)
[K     |████████████████████████████████| 66.2MB 57kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.2


# DEFINE

In [5]:
# DEFINE
DATA_PATH                   = '/content/drive/My Drive/skill_factory_car_price_prediction_ml_dl'
RANDOM_SEED                 = 42

# IMPORT

In [89]:
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 50)

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from catboost import CatBoostRegressor
from catboost import Pool
from catboost import cv
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [90]:
train = pd.read_csv(DATA_PATH+'/clear_train.csv')
test = pd.read_csv(DATA_PATH+'/clear_test.csv')

In [91]:
train.model_date = train.model_date.astype(str)
test.model_date = test.model_date.astype(str)

train.production_date = train.production_date.astype(str)
test.production_date = test.production_date.astype(str)

train.num_of_doors = train.num_of_doors.astype(str)
test.num_of_doors = test.num_of_doors.astype(str)

In [92]:
display(train.info())
display(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6682 entries, 0 to 6681
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   body                 6682 non-null   object 
 1   brand                6682 non-null   object 
 2   color                6682 non-null   object 
 3   description          6682 non-null   object 
 4   engine_displacement  6682 non-null   float64
 5   engine_power         6682 non-null   float64
 6   fuel                 6682 non-null   object 
 7   mileage              6682 non-null   int64  
 8   model_date           6682 non-null   object 
 9   model_info           6682 non-null   object 
 10  name                 6682 non-null   object 
 11  num_of_doors         6682 non-null   object 
 12  price                6682 non-null   float64
 13  production_date      6682 non-null   object 
 14  transmission         6682 non-null   object 
 15  owners               6682 non-null   o

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1671 entries, 0 to 1670
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   body                 1671 non-null   object 
 1   brand                1671 non-null   object 
 2   color                1671 non-null   object 
 3   description          1671 non-null   object 
 4   engine_displacement  1671 non-null   float64
 5   engine_power         1671 non-null   float64
 6   fuel                 1671 non-null   object 
 7   mileage              1671 non-null   int64  
 8   model_date           1671 non-null   object 
 9   model_info           1671 non-null   object 
 10  name                 1671 non-null   object 
 11  num_of_doors         1671 non-null   object 
 12  production_date      1671 non-null   object 
 13  transmission         1671 non-null   object 
 14  owners               1671 non-null   object 
 15  vehicle_title        1671 non-null   o

None

# DATA PREPARING

In [93]:
MMS = MinMaxScaler()
MMS_price = MinMaxScaler()
STDS = StandardScaler()

In [94]:
MMS.fit(train[['engine_displacement', 'engine_power', 'mileage', 'tax', 'days_in_use']])

MMS_price.fit(train[['price']])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [95]:
train[['engine_displacement', 'engine_power', 'mileage', 'tax', 'days_in_use']] = MMS.transform(train[['engine_displacement', 'engine_power', 'mileage', 'tax', 'days_in_use']])

train[['price']] = MMS_price.transform(train[['price']])

display(train[['engine_displacement', 'engine_power', 'mileage', 'tax', 'days_in_use']])

display(train[['price']])

Unnamed: 0,engine_displacement,engine_power,mileage,tax,days_in_use
0,0.400000,0.349291,0.244999,0.420221,0.342100
1,0.366667,0.228723,0.182999,0.130174,0.236833
2,0.483333,0.409574,0.122732,0.473934,0.184163
3,0.233333,0.186170,0.150999,0.085308,0.236833
4,0.200000,0.150709,0.139999,0.066351,0.236833
...,...,...,...,...,...
6677,0.233333,0.132979,0.232776,0.045814,0.368398
6678,0.233333,0.203901,0.033699,0.090574,0.105267
6679,0.233333,0.120567,0.199999,0.043233,0.315801
6680,0.400000,0.416667,0.071999,0.480253,0.105267


Unnamed: 0,price
0,0.012615
1,0.023251
2,0.043379
3,0.021768
4,0.021556
...,...
6677,0.010539
6678,0.067745
6679,0.009691
6680,0.070711


# SPLIT

In [96]:
X = train.drop(columns=['price', 'description'])
y = train['price']

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.15, shuffle=True, random_state=RANDOM_SEED)

In [98]:
train_data = Pool(data = X_train,
                  label = y_train,
                  cat_features =  ['body', 'brand', 'color', 'fuel', 'model_date', 'model_info', 'name', 'num_of_doors',
                                   'production_date', 'transmission', 'owners', 'vehicle_title', 'drive_type'])

test_data = Pool(data = X_test,
                 label = y_test,
                  cat_features =  ['body', 'brand', 'color', 'fuel', 'model_date', 'model_info', 'name', 'num_of_doors',
                                   'production_date', 'transmission', 'owners', 'vehicle_title', 'drive_type'])

# MODEL

In [99]:
ctb = CatBoostRegressor(loss_function = 'MAE',
                         eval_metric = 'MAPE',
                         learning_rate=0.005,
                         iterations=5500,
                         l2_leaf_reg=2,
                         depth=6,
                         bootstrap_type = 'Bayesian', # Bayesian Bernoulli
#                          subsample = 0.8,
                         one_hot_max_size = 5,
                         random_seed=42)
                        #  od_type='Iter',
                        #  od_wait=500)

model = ctb.fit(train_data,
                eval_set=test_data,
                verbose_eval=1000,
                use_best_model=True,
                plot=True)
print(model)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.0461388	test: 0.0451746	best: 0.0451746 (0)	total: 30.5ms	remaining: 2m 47s
1000:	learn: 0.0141315	test: 0.0139859	best: 0.0139859 (1000)	total: 11.4s	remaining: 51.3s
2000:	learn: 0.0117965	test: 0.0120971	best: 0.0120971 (2000)	total: 23.8s	remaining: 41.7s
3000:	learn: 0.0106323	test: 0.0112983	best: 0.0112983 (3000)	total: 37.3s	remaining: 31s
4000:	learn: 0.0098501	test: 0.0108602	best: 0.0108602 (4000)	total: 52.2s	remaining: 19.6s
5000:	learn: 0.0092981	test: 0.0105816	best: 0.0105816 (5000)	total: 1m 8s	remaining: 6.79s
5499:	learn: 0.0090808	test: 0.0104962	best: 0.0104962 (5499)	total: 1m 16s	remaining: 0us

bestTest = 0.01049615658
bestIteration = 5499

<catboost.core.CatBoostRegressor object at 0x7fc7665f2470>


# PREDICTION

In [100]:
test.drop(columns=['description'], inplace=True)

In [102]:
MMS.fit(test[['engine_displacement', 'engine_power', 'mileage', 'tax', 'days_in_use']])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [104]:
test[['engine_displacement', 'engine_power', 'mileage', 'tax', 'days_in_use']] = MMS.transform(test[['engine_displacement', 'engine_power', 'mileage', 'tax', 'days_in_use']])

test[['engine_displacement', 'engine_power', 'mileage', 'tax', 'days_in_use']]

Unnamed: 0,engine_power,mileage
0,0.198664,0.349999
1,0.353923,0.014999
2,0.195326,0.166599
3,0.058431,0.146999
4,0.148581,0.023999
...,...,...
1666,0.138564,0.075999
1667,0.195326,0.120380
1668,0.081803,0.025789
1669,0.138564,0.103999


In [105]:
predict_data = Pool(data = test,
                    cat_features =  ['body', 'brand', 'color', 'fuel', 'model_date', 'model_info', 'name', 'num_of_doors',
                                     'production_date', 'transmission', 'owners', 'vehicle_title', 'drive_type'])

In [106]:
predict_submission = MMS_price.inverse_transform(model.predict(predict_data).reshape(-1,1))
predict_submission

array([[ 377440.91580779],
       [2757598.81276056],
       [ 597817.45074044],
       ...,
       [1440895.04933164],
       [1061183.60154778],
       [1004007.55153357]])

# MAKE SUBMISSION


In [107]:
sample_submission = pd.read_csv('/content/drive/My Drive/skill_factory_car_price_prediction_ml_dl/sample_submission.csv')

In [108]:
# Результат как есть
sample_submission['price'] = predict_submission
sample_submission.to_csv('/content/drive/My Drive/skill_factory_car_price_prediction_ml_dl/sub.csv', index=False)
display(sample_submission.head(10)) # Пока лучший результат

# Результат с округлением
sample_submission['price'] = np.round(predict_submission, -4)
sample_submission.to_csv('/content/drive/My Drive/skill_factory_car_price_prediction_ml_dl/sub_round.csv', index=False)
display(sample_submission.head(10))

# Результат с округлением и умножением на коэффициент 0.94
sample_submission['price'] = np.round(predict_submission, -4)*0.94
sample_submission.to_csv('/content/drive/My Drive/skill_factory_car_price_prediction_ml_dl/sub_round_coef.csv', index=False)
display(sample_submission.head(10))

Unnamed: 0,sell_id,price
0,1099427284,377440.9
1,1096405886,2757599.0
2,1100195294,597817.5
3,1099827282,481827.3
4,1100076198,2458841.0
5,1090159352,2760880.0
6,1098987386,1341918.0
7,1098639040,1323472.0
8,1099933039,989264.0
9,1099565572,487760.4


Unnamed: 0,sell_id,price
0,1099427284,380000.0
1,1096405886,2760000.0
2,1100195294,600000.0
3,1099827282,480000.0
4,1100076198,2460000.0
5,1090159352,2760000.0
6,1098987386,1340000.0
7,1098639040,1320000.0
8,1099933039,990000.0
9,1099565572,490000.0


Unnamed: 0,sell_id,price
0,1099427284,357200.0
1,1096405886,2594400.0
2,1100195294,564000.0
3,1099827282,451200.0
4,1100076198,2312400.0
5,1090159352,2594400.0
6,1098987386,1259600.0
7,1098639040,1240800.0
8,1099933039,930600.0
9,1099565572,460600.0
