## About Dataset

### Scraped data of used cars listings. 100,000 listings, which have been separated into files corresponding to each car manufacturer (total 13 csv files).

### The data set contains information of price, transmission, mileage, fuel type, road tax, miles per gallon (mpg), and engine size (in liters).

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)



## Read the Datasets

In [2]:
df_audi = pd.read_csv(r"C:\Users\Berat Arslan\PycharmProjects\pythonProject1\DSMLBC8\donem_odevı\datasets\audi.csv")
df_bmw = pd.read_csv(r"C:\Users\Berat Arslan\PycharmProjects\pythonProject1\DSMLBC8\donem_odevı\datasets\bmw.csv")
df_ford = pd.read_csv(r"C:\Users\Berat Arslan\PycharmProjects\pythonProject1\DSMLBC8\donem_odevı\datasets\ford.csv")
df_hyundai = pd.read_csv(r"C:\Users\Berat Arslan\PycharmProjects\pythonProject1\DSMLBC8\donem_odevı\datasets\hyundi.csv")
df_mercedes = pd.read_csv(r"C:\Users\Berat Arslan\PycharmProjects\pythonProject1\DSMLBC8\donem_odevı\datasets\merc.csv")  
df_skoda = pd.read_csv(r"C:\Users\Berat Arslan\PycharmProjects\pythonProject1\DSMLBC8\donem_odevı\datasets\skoda.csv")
df_toyota = pd.read_csv(r"C:\Users\Berat Arslan\PycharmProjects\pythonProject1\DSMLBC8\donem_odevı\datasets\toyota.csv")
df_vauxhall = pd.read_csv(r"C:\Users\Berat Arslan\PycharmProjects\pythonProject1\DSMLBC8\donem_odevı\datasets\vauxhall.csv")
df_vw = pd.read_csv(r"C:\Users\Berat Arslan\PycharmProjects\pythonProject1\DSMLBC8\donem_odevı\datasets\vw.csv")

### In Hyundai's dataset, the tax feature was given a different name. So we changed that.

In [3]:
df_hyundai['tax'] = 'A'
df_hyundai['tax'] = df_hyundai['tax(£)']
df_hyundai.head()
del df_hyundai['tax(£)']

In [4]:
df_hyundai.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,tax
0,I20,2017,7999,Manual,17307,Petrol,58.9,1.2,145
1,Tucson,2016,14499,Automatic,25233,Diesel,43.5,2.0,235
2,Tucson,2016,11399,Manual,37877,Diesel,61.7,1.7,30
3,I10,2016,6499,Manual,23789,Petrol,60.1,1.0,20
4,IX35,2015,10199,Manual,33177,Diesel,51.4,2.0,160


## Extraction of The 'brand' Feature

In [5]:
df_audi['brand'] = 'audi' 
df_bmw['brand'] = 'bmw'
df_ford['brand'] = 'ford'
df_hyundai['brand'] = 'hyundai'
df_mercedes['brand'] = 'mercedes'
df_skoda['brand'] = 'skoda'
df_toyota['brand'] = 'toyota'
df_vauxhall['brand'] = 'vauxhall'
df_vw['brand'] = 'vw'

## Combine All to One csv File

In [6]:
frames = [df_audi, df_bmw, df_ford, df_hyundai, df_mercedes, df_skoda, df_toyota, df_vauxhall, df_vw]
df = pd.concat(frames)
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4,audi
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0,audi
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4,audi
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0,audi
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0,audi


In [7]:
df.isnull().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
brand           0
dtype: int64

## Converting 'year' to 'age'

In [8]:
df['year'] = 2021 - df['year']
df['age'] = df['year']
del df['year']
df.head()

Unnamed: 0,model,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand,age
0,A1,12500,Manual,15735,Petrol,150,55.4,1.4,audi,4
1,A6,16500,Automatic,36203,Diesel,20,64.2,2.0,audi,5
2,A1,11000,Manual,29946,Petrol,30,55.4,1.4,audi,5
3,A4,16800,Automatic,25952,Diesel,145,67.3,2.0,audi,4
4,A3,17300,Manual,1998,Petrol,145,49.6,1.0,audi,2


### When we draw the 'engineSize' and 'fuel consumption(mpg)' graphs, we observe that it is incompatible with real life values. That's why we're deleting the 'fuel consumption(mpg)' variable.

![indir.png](attachment:indir.png)

In [9]:
del df['mpg']

In [10]:
df.head()

Unnamed: 0,model,price,transmission,mileage,fuelType,tax,engineSize,brand,age
0,A1,12500,Manual,15735,Petrol,150,1.4,audi,4
1,A6,16500,Automatic,36203,Diesel,20,2.0,audi,5
2,A1,11000,Manual,29946,Petrol,30,1.4,audi,5
3,A4,16800,Automatic,25952,Diesel,145,2.0,audi,4
4,A3,17300,Manual,1998,Petrol,145,1.0,audi,2


In [11]:
df.describe([0.01,0.05,0.95,0.99]).T

Unnamed: 0,count,mean,std,min,1%,5%,50%,95%,99%,max
price,99187.0,16805.348,9866.773,450.0,3995.0,6495.0,14495.0,34400.0,52000.0,159999.0
mileage,99187.0,23058.914,21148.524,1.0,10.0,1000.0,17460.0,64763.7,96215.38,323000.0
tax,99187.0,120.3,63.151,0.0,0.0,0.0,145.0,200.0,265.0,580.0
engineSize,99187.0,1.663,0.558,0.0,1.0,1.0,1.6,3.0,3.0,6.6
age,99187.0,3.912,2.124,-39.0,1.0,2.0,4.0,8.0,11.0,51.0


In [12]:
df = df[df['engineSize']>0]
df = df[df['age']>0]

## Outliers

In [13]:
def outlier_thresholds(dataframe, col_name, q1=0.02, q3=0.98):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit

def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False
    
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """

    Veri setindeki kategorik, numerik ve kategorik fakat kardinal değişkenlerin isimlerini verir.
    Not: Kategorik değişkenlerin içerisine numerik görünümlü kategorik değişkenler de dahildir.

    Parameters
    ------
        dataframe: dataframe
                Değişken isimleri alınmak istenilen dataframe
        cat_th: int, optional
                numerik fakat kategorik olan değişkenler için sınıf eşik değeri
        car_th: int, optinal
                kategorik fakat kardinal değişkenler için sınıf eşik değeri

    Returns
    ------
        cat_cols: list
                Kategorik değişken listesi
        num_cols: list
                Numerik değişken listesi
        cat_but_car: list
                Kategorik görünümlü kardinal değişken listesi

    Examples
    ------
        import seaborn as sns
        df = sns.load_dataset("iris")
        print(grab_col_names(df))


    Notes
    ------
        cat_cols + num_cols + cat_but_car = toplam değişken sayısı
        num_but_cat cat_cols'un içerisinde.
        Return olan 3 liste toplamı toplam değişken sayısına eşittir: cat_cols + num_cols + cat_but_car = değişken sayısı

    """

    # cat_cols, cat_but_car
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                   dataframe[col].dtypes != "O"]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and
                   dataframe[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # num_cols
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]
    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')
    return cat_cols, num_cols, cat_but_car

cat_cols, num_cols, cat_but_car = grab_col_names(df)

Observations: 98913
Variables: 9
cat_cols: 3
num_cols: 5
cat_but_car: 1
num_but_cat: 0


In [14]:
for col in num_cols:
    print(col, check_outlier(df, col))

price True
mileage True
tax False
engineSize True
age True


In [15]:
def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit

In [16]:
for col in num_cols:
    replace_with_thresholds(df, col)

In [17]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,98913.0,16795.723,9770.469,450.0,9999.0,14495.0,20875.0,104977.5
mileage,98913.0,23059.99,21108.949,1.0,7424.0,17456.0,32346.0,207453.5
tax,98913.0,120.322,63.121,0.0,125.0,145.0,145.0,580.0
engineSize,98913.0,1.668,0.551,0.6,1.2,1.6,2.0,6.0
age,98913.0,3.912,2.106,1.0,2.0,4.0,5.0,21.0


## Feature Extraction (Adding 2 New Features consisting of 9 Categories )

In [18]:
df.loc[(df['mileage']<7500)&(df['age']<3),'mileage_age'] = 'm1a1'

df.loc[(df['mileage']<7500)&(df['age']>=3)&(df['age']<=5),'mileage_age'] = 'm1a2'

df.loc[(df['mileage']<7500)&(df['age']>5),'mileage_age'] = 'm1a3'

In [19]:
df.loc[(df['mileage']>=7500)&(df['mileage']<32500)&(df['age']<3),'mileage_age'] = 'm2a1'

df.loc[(df['mileage']>=7500)&(df['mileage']<32500)&(df['age']>=3)&(df['age']<6),'mileage_age'] = 'm2a2'

df.loc[(df['mileage']>=7500)&(df['mileage']<32500)&(df['age']>5),'mileage_age'] = 'm2a3'

In [20]:
df.loc[(df['mileage']>=32500)&(df['age']<3),'mileage_age'] = 'm3a1'

df.loc[(df['mileage']>=32500)&(df['age']>=3)&(df['age']<=5),'mileage_age'] = 'm3a2'

df.loc[(df['mileage']>=32500)&(df['age']>5),'mileage_age'] = 'm3a3'

In [21]:
df['mileage_age'].unique()

array(['m2a2', 'm3a2', 'm1a1', 'm3a3', 'm2a3', 'm2a1', 'm1a2', 'm1a3',
       'm3a1'], dtype=object)

In [22]:
df.loc[(df['engineSize']<1300)&(df['age']<3),'eS_age'] = 'e1a1'

df.loc[(df['engineSize']<1300)&(df['age']>=3)&(df['age']<=5),'eS_age'] = 'e1a2'

df.loc[(df['engineSize']<1300)&(df['age']>5),'eS_age'] = 'e1a3'

In [23]:
df.loc[(df['engineSize']>=1300)&(df['engineSize']<2000)&(df['age']<3),'eS_age'] = 'e2a1'

df.loc[(df['engineSize']>=1300)&(df['engineSize']<2000)&(df['age']>=3)&(df['age']<6),'eS_age'] = 'e2a2'

df.loc[(df['engineSize']>=1300)&(df['engineSize']<2000)&(df['age']>5),'eS_age'] = 'e2a3'

In [24]:
df.loc[(df['engineSize']>=2000)&(df['age']<3),'eS_age'] = 'e3a1'

df.loc[(df['engineSize']>=2000)&(df['age']>=3)&(df['age']<=5),'eS_age'] = 'e3a2'

df.loc[(df['engineSize']>=2000)&(df['age']>5),'eS_age'] = 'e3a3'

In [25]:
df.head()

Unnamed: 0,model,price,transmission,mileage,fuelType,tax,engineSize,brand,age,mileage_age,eS_age
0,A1,12500.0,Manual,15735.0,Petrol,150.0,1.4,audi,4,m2a2,e1a2
1,A6,16500.0,Automatic,36203.0,Diesel,20.0,2.0,audi,5,m3a2,e1a2
2,A1,11000.0,Manual,29946.0,Petrol,30.0,1.4,audi,5,m2a2,e1a2
3,A4,16800.0,Automatic,25952.0,Diesel,145.0,2.0,audi,4,m2a2,e1a2
4,A3,17300.0,Manual,1998.0,Petrol,145.0,1.0,audi,2,m1a1,e1a1


In [26]:
df.isnull().any()

model           False
price           False
transmission    False
mileage         False
fuelType        False
tax             False
engineSize      False
brand           False
age             False
mileage_age     False
eS_age          False
dtype: bool

## Label Encoding & One-Hot Encoding

In [27]:
cat_cols, cat_but_car, num_cols = grab_col_names(df)

Observations: 98913
Variables: 11
cat_cols: 5
num_cols: 5
cat_but_car: 1
num_but_cat: 0


In [28]:
cat_cols.append('model')
cat_cols

['transmission', 'fuelType', 'brand', 'mileage_age', 'eS_age', 'model']

In [29]:
num_cols = ['price','engineSize','age','mileage','tax']

In [30]:
def label_encoder(dataframe, binary_col):
    labelencoder = LabelEncoder()
    dataframe[binary_col] = labelencoder.fit_transform(dataframe[binary_col])
    return dataframe

In [31]:
binary_cols = [col for col in df.columns if df[col].dtypes == "O" and len(df[col].unique()) == 2]


In [32]:
for col in binary_cols:
    label_encoder(df, col)

In [33]:
def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    dataframe = pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)
    return dataframe

df = one_hot_encoder(df, cat_cols, drop_first=True)

df.head(1)

Unnamed: 0,price,mileage,tax,engineSize,age,transmission_Manual,transmission_Other,transmission_Semi-Auto,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,brand_bmw,brand_ford,brand_hyundai,brand_mercedes,brand_skoda,brand_toyota,brand_vauxhall,brand_vw,mileage_age_m1a2,mileage_age_m1a3,mileage_age_m2a1,mileage_age_m2a2,mileage_age_m2a3,mileage_age_m3a1,mileage_age_m3a2,mileage_age_m3a3,eS_age_e1a2,eS_age_e1a3,model_ 2 Series,model_ 3 Series,model_ 4 Series,model_ 5 Series,model_ 6 Series,model_ 7 Series,model_ 8 Series,model_ A Class,model_ A1,model_ A2,model_ A3,model_ A4,model_ A5,model_ A6,model_ A7,model_ A8,model_ Accent,model_ Adam,model_ Agila,model_ Amarok,model_ Amica,model_ Ampera,model_ Antara,model_ Arteon,model_ Astra,model_ Auris,model_ Avensis,model_ Aygo,model_ B Class,model_ B-MAX,model_ Beetle,model_ C Class,model_ C-HR,model_ C-MAX,model_ CC,model_ CL Class,model_ CLA Class,model_ CLC Class,model_ CLK,model_ CLS Class,model_ Caddy,model_ Caddy Life,model_ Caddy Maxi,model_ Caddy Maxi Life,model_ California,model_ Camry,model_ Caravelle,model_ Cascada,model_ Citigo,model_ Combo Life,model_ Corolla,model_ Corsa,model_ Crossland X,model_ E Class,model_ EcoSport,model_ Edge,model_ Eos,model_ Escort,model_ Fabia,model_ Fiesta,model_ Focus,model_ Fox,model_ Fusion,model_ G Class,model_ GL Class,model_ GLA Class,model_ GLB Class,model_ GLC Class,model_ GLE Class,model_ GLS Class,model_ GT86,model_ GTC,model_ Galaxy,model_ Getz,model_ Golf,model_ Golf SV,model_ Grand C-MAX,model_ Grand Tourneo Connect,model_ Grandland X,model_ Hilux,model_ I10,model_ I20,model_ I30,model_ I40,model_ I800,model_ IQ,model_ IX20,model_ IX35,model_ Insignia,model_ Ioniq,model_ Jetta,model_ KA,model_ Ka+,model_ Kadjar,model_ Kamiq,model_ Karoq,model_ Kodiaq,model_ Kona,model_ Kuga,model_ Land Cruiser,model_ M Class,model_ M2,model_ M3,model_ M4,model_ M5,model_ M6,model_ Meriva,model_ Mokka,model_ Mokka X,model_ Mondeo,model_ Mustang,model_ Octavia,model_ PROACE VERSO,model_ Passat,model_ Polo,model_ Prius,model_ Puma,model_ Q2,model_ Q3,model_ Q5,model_ Q7,model_ Q8,model_ R Class,model_ R8,model_ RAV4,model_ RS3,model_ RS4,model_ RS5,model_ RS6,model_ RS7,model_ Ranger,model_ Rapid,model_ Roomster,model_ S Class,model_ S-MAX,model_ S3,model_ S4,model_ S5,model_ S8,model_ SL CLASS,model_ SLK,model_ SQ5,model_ SQ7,model_ Santa Fe,model_ Scala,model_ Scirocco,model_ Sharan,model_ Shuttle,model_ Streetka,model_ Superb,model_ Supra,model_ T-Cross,model_ T-Roc,model_ TT,model_ Terracan,model_ Tigra,model_ Tiguan,model_ Tiguan Allspace,model_ Touareg,model_ Touran,model_ Tourneo Connect,model_ Tourneo Custom,model_ Transit Tourneo,model_ Tucson,model_ Up,model_ Urban Cruiser,model_ V Class,model_ Vectra,model_ Veloster,model_ Verso,model_ Verso-S,model_ Viva,model_ Vivaro,model_ X-CLASS,model_ X1,model_ X2,model_ X3,model_ X4,model_ X5,model_ X6,model_ X7,model_ Yaris,model_ Yeti,model_ Yeti Outdoor,model_ Z3,model_ Z4,model_ Zafira,model_ Zafira Tourer,model_ i3,model_ i8,model_180,model_200,model_220
0,12500.0,15735.0,150.0,1.4,4,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Log Transformation

In [34]:
y = np.log1p(df['price'])
X = df.drop(["price"], axis=1)


## Splitting the dataset into 'train' and 'test'

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=17)


## Running These Regression Models for The Best RMSE Results

In [36]:
models = [('LR', LinearRegression()),
          ("Ridge", Ridge()),
          ("Lasso", Lasso()),
          ("ElasticNet", ElasticNet()),
          ('KNN', KNeighborsRegressor()),
          ('CART', DecisionTreeRegressor()),
          ('RF', RandomForestRegressor()),
          #('SVR', SVR()),
          ('GBM', GradientBoostingRegressor()),
          ("XGBoost", XGBRegressor(objective='reg:squarederror')),
          ("LightGBM", LGBMRegressor()),
         ("CatBoost", CatBoostRegressor(verbose=False))]
          

In [37]:
for name, regressor in models:
    rmse = np.mean(np.sqrt(-cross_val_score(regressor, X, y, cv=10, scoring="neg_mean_squared_error")))
    print(f"RMSE: {round(rmse, 4)} ({name}) ")

RMSE: 0.1899 (LR) 
RMSE: 0.1901 (Ridge) 
RMSE: 0.4576 (Lasso) 
RMSE: 0.4573 (ElasticNet) 
RMSE: 0.4297 (KNN) 
RMSE: 0.2215 (CART) 
RMSE: 0.1866 (RF) 
RMSE: 0.2138 (GBM) 
RMSE: 0.1736 (XGBoost) 
RMSE: 0.1855 (LightGBM) 
RMSE: 0.1653 (CatBoost) 


## Modelling

In [38]:
cat_boost_model = CatBoostRegressor(verbose=False)

In [39]:
rmse = np.mean(np.sqrt(-cross_val_score(cat_boost_model, X, y, cv=10, scoring="neg_mean_squared_error")))


In [40]:
rmse

0.16531946736639985

## Hyperparameter Optimization for the Best-Performing 'Catboost Regressor' Model

In [41]:
parameters = {'depth'         : [7,8,9],
              'learning_rate' : [0.11, 0.12, 0.13],
              'iterations'    : [1600, 1700, 1800]}


In [42]:
cat_boost_model_best = GridSearchCV(cat_boost_model,parameters,cv=5,n_jobs=-1,verbose=True).fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [43]:
cat_boost_model_best.best_params_

{'depth': 9, 'iterations': 1800, 'learning_rate': 0.13}

In [None]:
#A =  cat_boost_model_best.best_params_

In [None]:
#A = {'depth': 9, 'iterations': 1800, 'learning_rate': 0.13}
#A

In [44]:
final_model = cat_boost_model.set_params(**cat_boost_model_best.best_params_).fit(X, y)


In [45]:
rmse = np.mean(np.sqrt(-cross_val_score(final_model, X, y, cv=10, scoring="neg_mean_squared_error")))

In [46]:
rmse

0.16350533259082684

## Final Model

In [47]:
cat_boost_tuned = CatBoostRegressor(**cat_boost_model_best.best_params_).fit(X_train, y_train)
y_pred = cat_boost_tuned.predict(X_test)

0:	learn: 0.4784033	total: 9.67ms	remaining: 17.4s
1:	learn: 0.4323252	total: 18.9ms	remaining: 17s
2:	learn: 0.3918742	total: 27.7ms	remaining: 16.6s
3:	learn: 0.3573777	total: 36.3ms	remaining: 16.3s
4:	learn: 0.3294684	total: 44.6ms	remaining: 16s
5:	learn: 0.3060873	total: 52.8ms	remaining: 15.8s
6:	learn: 0.2857302	total: 61.2ms	remaining: 15.7s
7:	learn: 0.2688979	total: 69.4ms	remaining: 15.5s
8:	learn: 0.2536912	total: 77.9ms	remaining: 15.5s
9:	learn: 0.2414138	total: 86ms	remaining: 15.4s
10:	learn: 0.2305249	total: 94.5ms	remaining: 15.4s
11:	learn: 0.2218419	total: 103ms	remaining: 15.3s
12:	learn: 0.2134212	total: 111ms	remaining: 15.3s
13:	learn: 0.2069510	total: 119ms	remaining: 15.2s
14:	learn: 0.2011072	total: 128ms	remaining: 15.2s
15:	learn: 0.1949374	total: 136ms	remaining: 15.2s
16:	learn: 0.1904152	total: 144ms	remaining: 15.1s
17:	learn: 0.1865576	total: 152ms	remaining: 15.1s
18:	learn: 0.1834454	total: 161ms	remaining: 15s
19:	learn: 0.1800837	total: 169ms	rema

183:	learn: 0.1116180	total: 1.51s	remaining: 13.3s
184:	learn: 0.1115199	total: 1.52s	remaining: 13.3s
185:	learn: 0.1114438	total: 1.53s	remaining: 13.3s
186:	learn: 0.1113211	total: 1.54s	remaining: 13.3s
187:	learn: 0.1112701	total: 1.55s	remaining: 13.3s
188:	learn: 0.1111708	total: 1.56s	remaining: 13.3s
189:	learn: 0.1110734	total: 1.57s	remaining: 13.3s
190:	learn: 0.1109279	total: 1.58s	remaining: 13.3s
191:	learn: 0.1108557	total: 1.58s	remaining: 13.3s
192:	learn: 0.1107447	total: 1.59s	remaining: 13.3s
193:	learn: 0.1106208	total: 1.6s	remaining: 13.3s
194:	learn: 0.1105313	total: 1.61s	remaining: 13.2s
195:	learn: 0.1104181	total: 1.62s	remaining: 13.2s
196:	learn: 0.1103248	total: 1.63s	remaining: 13.2s
197:	learn: 0.1102208	total: 1.63s	remaining: 13.2s
198:	learn: 0.1101176	total: 1.64s	remaining: 13.2s
199:	learn: 0.1100668	total: 1.65s	remaining: 13.2s
200:	learn: 0.1099670	total: 1.66s	remaining: 13.2s
201:	learn: 0.1098331	total: 1.67s	remaining: 13.2s
202:	learn: 0

360:	learn: 0.0999686	total: 3s	remaining: 12s
361:	learn: 0.0999329	total: 3.01s	remaining: 12s
362:	learn: 0.0999201	total: 3.02s	remaining: 11.9s
363:	learn: 0.0998675	total: 3.03s	remaining: 11.9s
364:	learn: 0.0998151	total: 3.04s	remaining: 11.9s
365:	learn: 0.0997789	total: 3.04s	remaining: 11.9s
366:	learn: 0.0997254	total: 3.05s	remaining: 11.9s
367:	learn: 0.0996886	total: 3.06s	remaining: 11.9s
368:	learn: 0.0996695	total: 3.07s	remaining: 11.9s
369:	learn: 0.0996479	total: 3.07s	remaining: 11.9s
370:	learn: 0.0996035	total: 3.08s	remaining: 11.9s
371:	learn: 0.0995910	total: 3.09s	remaining: 11.9s
372:	learn: 0.0995383	total: 3.1s	remaining: 11.9s
373:	learn: 0.0995060	total: 3.11s	remaining: 11.8s
374:	learn: 0.0994517	total: 3.12s	remaining: 11.8s
375:	learn: 0.0994229	total: 3.12s	remaining: 11.8s
376:	learn: 0.0993680	total: 3.13s	remaining: 11.8s
377:	learn: 0.0993317	total: 3.14s	remaining: 11.8s
378:	learn: 0.0993117	total: 3.15s	remaining: 11.8s
379:	learn: 0.099272

540:	learn: 0.0948585	total: 4.5s	remaining: 10.5s
541:	learn: 0.0948236	total: 4.5s	remaining: 10.5s
542:	learn: 0.0948046	total: 4.51s	remaining: 10.4s
543:	learn: 0.0947811	total: 4.52s	remaining: 10.4s
544:	learn: 0.0947494	total: 4.53s	remaining: 10.4s
545:	learn: 0.0947394	total: 4.54s	remaining: 10.4s
546:	learn: 0.0947325	total: 4.55s	remaining: 10.4s
547:	learn: 0.0947041	total: 4.55s	remaining: 10.4s
548:	learn: 0.0946776	total: 4.56s	remaining: 10.4s
549:	learn: 0.0946729	total: 4.57s	remaining: 10.4s
550:	learn: 0.0946409	total: 4.58s	remaining: 10.4s
551:	learn: 0.0946089	total: 4.59s	remaining: 10.4s
552:	learn: 0.0945862	total: 4.6s	remaining: 10.4s
553:	learn: 0.0945673	total: 4.61s	remaining: 10.4s
554:	learn: 0.0945285	total: 4.62s	remaining: 10.4s
555:	learn: 0.0945143	total: 4.62s	remaining: 10.3s
556:	learn: 0.0944739	total: 4.63s	remaining: 10.3s
557:	learn: 0.0944400	total: 4.64s	remaining: 10.3s
558:	learn: 0.0944364	total: 4.65s	remaining: 10.3s
559:	learn: 0.0

721:	learn: 0.0915031	total: 6.01s	remaining: 8.97s
722:	learn: 0.0914903	total: 6.02s	remaining: 8.96s
723:	learn: 0.0914741	total: 6.03s	remaining: 8.95s
724:	learn: 0.0914505	total: 6.03s	remaining: 8.95s
725:	learn: 0.0914420	total: 6.04s	remaining: 8.94s
726:	learn: 0.0914339	total: 6.05s	remaining: 8.93s
727:	learn: 0.0914240	total: 6.06s	remaining: 8.92s
728:	learn: 0.0914033	total: 6.07s	remaining: 8.91s
729:	learn: 0.0913937	total: 6.07s	remaining: 8.9s
730:	learn: 0.0913795	total: 6.08s	remaining: 8.89s
731:	learn: 0.0913549	total: 6.09s	remaining: 8.88s
732:	learn: 0.0913465	total: 6.1s	remaining: 8.87s
733:	learn: 0.0913235	total: 6.1s	remaining: 8.87s
734:	learn: 0.0913023	total: 6.11s	remaining: 8.86s
735:	learn: 0.0912871	total: 6.12s	remaining: 8.85s
736:	learn: 0.0912736	total: 6.13s	remaining: 8.84s
737:	learn: 0.0912581	total: 6.14s	remaining: 8.83s
738:	learn: 0.0912375	total: 6.14s	remaining: 8.82s
739:	learn: 0.0912306	total: 6.15s	remaining: 8.81s
740:	learn: 0.0

899:	learn: 0.0892163	total: 7.48s	remaining: 7.48s
900:	learn: 0.0892044	total: 7.49s	remaining: 7.47s
901:	learn: 0.0891902	total: 7.5s	remaining: 7.46s
902:	learn: 0.0891706	total: 7.5s	remaining: 7.45s
903:	learn: 0.0891564	total: 7.51s	remaining: 7.45s
904:	learn: 0.0891357	total: 7.52s	remaining: 7.44s
905:	learn: 0.0891263	total: 7.53s	remaining: 7.43s
906:	learn: 0.0891089	total: 7.54s	remaining: 7.42s
907:	learn: 0.0891076	total: 7.54s	remaining: 7.41s
908:	learn: 0.0890998	total: 7.55s	remaining: 7.4s
909:	learn: 0.0890902	total: 7.56s	remaining: 7.39s
910:	learn: 0.0890888	total: 7.57s	remaining: 7.39s
911:	learn: 0.0890821	total: 7.58s	remaining: 7.38s
912:	learn: 0.0890702	total: 7.59s	remaining: 7.37s
913:	learn: 0.0890626	total: 7.59s	remaining: 7.36s
914:	learn: 0.0890520	total: 7.6s	remaining: 7.35s
915:	learn: 0.0890385	total: 7.61s	remaining: 7.34s
916:	learn: 0.0890335	total: 7.62s	remaining: 7.33s
917:	learn: 0.0890268	total: 7.63s	remaining: 7.33s
918:	learn: 0.08

1062:	learn: 0.0875734	total: 8.8s	remaining: 6.1s
1063:	learn: 0.0875710	total: 8.8s	remaining: 6.09s
1064:	learn: 0.0875643	total: 8.81s	remaining: 6.08s
1065:	learn: 0.0875567	total: 8.82s	remaining: 6.07s
1066:	learn: 0.0875517	total: 8.83s	remaining: 6.07s
1067:	learn: 0.0875493	total: 8.84s	remaining: 6.06s
1068:	learn: 0.0875486	total: 8.84s	remaining: 6.05s
1069:	learn: 0.0875374	total: 8.85s	remaining: 6.04s
1070:	learn: 0.0875300	total: 8.86s	remaining: 6.03s
1071:	learn: 0.0875211	total: 8.87s	remaining: 6.02s
1072:	learn: 0.0875143	total: 8.88s	remaining: 6.01s
1073:	learn: 0.0875061	total: 8.88s	remaining: 6s
1074:	learn: 0.0874998	total: 8.89s	remaining: 6s
1075:	learn: 0.0874858	total: 8.9s	remaining: 5.99s
1076:	learn: 0.0874808	total: 8.91s	remaining: 5.98s
1077:	learn: 0.0874693	total: 8.92s	remaining: 5.97s
1078:	learn: 0.0874598	total: 8.93s	remaining: 5.96s
1079:	learn: 0.0874526	total: 8.93s	remaining: 5.96s
1080:	learn: 0.0874448	total: 8.94s	remaining: 5.95s
108

1223:	learn: 0.0862401	total: 10.1s	remaining: 4.75s
1224:	learn: 0.0862341	total: 10.1s	remaining: 4.75s
1225:	learn: 0.0862271	total: 10.1s	remaining: 4.74s
1226:	learn: 0.0862244	total: 10.1s	remaining: 4.73s
1227:	learn: 0.0862195	total: 10.1s	remaining: 4.72s
1228:	learn: 0.0862153	total: 10.1s	remaining: 4.71s
1229:	learn: 0.0862073	total: 10.2s	remaining: 4.71s
1230:	learn: 0.0861998	total: 10.2s	remaining: 4.7s
1231:	learn: 0.0861892	total: 10.2s	remaining: 4.69s
1232:	learn: 0.0861834	total: 10.2s	remaining: 4.68s
1233:	learn: 0.0861725	total: 10.2s	remaining: 4.67s
1234:	learn: 0.0861634	total: 10.2s	remaining: 4.66s
1235:	learn: 0.0861573	total: 10.2s	remaining: 4.66s
1236:	learn: 0.0861503	total: 10.2s	remaining: 4.65s
1237:	learn: 0.0861445	total: 10.2s	remaining: 4.64s
1238:	learn: 0.0861390	total: 10.2s	remaining: 4.63s
1239:	learn: 0.0861289	total: 10.2s	remaining: 4.62s
1240:	learn: 0.0861258	total: 10.2s	remaining: 4.61s
1241:	learn: 0.0861140	total: 10.3s	remaining: 

1380:	learn: 0.0851299	total: 11.4s	remaining: 3.46s
1381:	learn: 0.0851199	total: 11.4s	remaining: 3.45s
1382:	learn: 0.0851170	total: 11.4s	remaining: 3.45s
1383:	learn: 0.0851110	total: 11.4s	remaining: 3.44s
1384:	learn: 0.0851027	total: 11.4s	remaining: 3.43s
1385:	learn: 0.0850905	total: 11.5s	remaining: 3.42s
1386:	learn: 0.0850874	total: 11.5s	remaining: 3.41s
1387:	learn: 0.0850817	total: 11.5s	remaining: 3.4s
1388:	learn: 0.0850730	total: 11.5s	remaining: 3.4s
1389:	learn: 0.0850664	total: 11.5s	remaining: 3.39s
1390:	learn: 0.0850631	total: 11.5s	remaining: 3.38s
1391:	learn: 0.0850612	total: 11.5s	remaining: 3.37s
1392:	learn: 0.0850499	total: 11.5s	remaining: 3.36s
1393:	learn: 0.0850438	total: 11.5s	remaining: 3.35s
1394:	learn: 0.0850369	total: 11.5s	remaining: 3.35s
1395:	learn: 0.0850301	total: 11.5s	remaining: 3.34s
1396:	learn: 0.0850234	total: 11.5s	remaining: 3.33s
1397:	learn: 0.0850224	total: 11.6s	remaining: 3.32s
1398:	learn: 0.0850147	total: 11.6s	remaining: 3

1536:	learn: 0.0841272	total: 12.7s	remaining: 2.17s
1537:	learn: 0.0841235	total: 12.7s	remaining: 2.17s
1538:	learn: 0.0841147	total: 12.7s	remaining: 2.16s
1539:	learn: 0.0841116	total: 12.7s	remaining: 2.15s
1540:	learn: 0.0841092	total: 12.7s	remaining: 2.14s
1541:	learn: 0.0841036	total: 12.8s	remaining: 2.13s
1542:	learn: 0.0840984	total: 12.8s	remaining: 2.13s
1543:	learn: 0.0840904	total: 12.8s	remaining: 2.12s
1544:	learn: 0.0840878	total: 12.8s	remaining: 2.11s
1545:	learn: 0.0840782	total: 12.8s	remaining: 2.1s
1546:	learn: 0.0840684	total: 12.8s	remaining: 2.09s
1547:	learn: 0.0840617	total: 12.8s	remaining: 2.08s
1548:	learn: 0.0840566	total: 12.8s	remaining: 2.08s
1549:	learn: 0.0840478	total: 12.8s	remaining: 2.07s
1550:	learn: 0.0840454	total: 12.8s	remaining: 2.06s
1551:	learn: 0.0840401	total: 12.8s	remaining: 2.05s
1552:	learn: 0.0840359	total: 12.8s	remaining: 2.04s
1553:	learn: 0.0840233	total: 12.9s	remaining: 2.03s
1554:	learn: 0.0840197	total: 12.9s	remaining: 

1693:	learn: 0.0832446	total: 14s	remaining: 876ms
1694:	learn: 0.0832388	total: 14s	remaining: 868ms
1695:	learn: 0.0832371	total: 14s	remaining: 860ms
1696:	learn: 0.0832332	total: 14s	remaining: 852ms
1697:	learn: 0.0832254	total: 14s	remaining: 843ms
1698:	learn: 0.0832179	total: 14s	remaining: 835ms
1699:	learn: 0.0832107	total: 14.1s	remaining: 827ms
1700:	learn: 0.0831958	total: 14.1s	remaining: 819ms
1701:	learn: 0.0831915	total: 14.1s	remaining: 810ms
1702:	learn: 0.0831855	total: 14.1s	remaining: 802ms
1703:	learn: 0.0831778	total: 14.1s	remaining: 794ms
1704:	learn: 0.0831765	total: 14.1s	remaining: 785ms
1705:	learn: 0.0831690	total: 14.1s	remaining: 777ms
1706:	learn: 0.0831626	total: 14.1s	remaining: 769ms
1707:	learn: 0.0831584	total: 14.1s	remaining: 761ms
1708:	learn: 0.0831540	total: 14.1s	remaining: 752ms
1709:	learn: 0.0831471	total: 14.1s	remaining: 744ms
1710:	learn: 0.0831433	total: 14.1s	remaining: 736ms
1711:	learn: 0.0831373	total: 14.2s	remaining: 728ms
1712:

## Inversing the log Transform

In [48]:
new_y = np.expm1(y_pred)
new_y_test = np.expm1(y_test)

np.sqrt(mean_squared_error(new_y_test, new_y))

1979.1822396958178

 ## Testing The Final Model

In [49]:
# Train RMSE
y_pred = final_model.predict(X_train)
np.sqrt(mean_squared_error(y_train, y_pred))

0.08386816976279211

In [50]:
# TRAIN RKARE
final_model.score(X_train, y_train)

0.9751910691476736

In [51]:
# Test RMSE
y_pred = final_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

0.08477066463170284

In [52]:
# Test RKARE
final_model.score(X_test, y_test)

0.9754910530266766

## Prediction for a New Observation

In [63]:
df.columns
random_user = X.sample(1)
#np.array(10**final_model.predict(random_user))

## Saving The Model

In [None]:
joblib.dump(final_model, "final_model.pkl")