# **Car Price Prediction Using Machine Learning Algorithms**

#### Overview
1. Import libraries
2. Load dataset
3. Explore & clean the dataset 
4. Data preprocessing
5. Train & Evaluate model
6. Improve model performance
7. Save Model

-----

### **<p style="color:  #4ceae5 ">Import Libraries</p>**

In [1]:
import re
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

### **<p style="color:  #4ceae5 ">Load The Dataset</p>**

In [2]:
df = pd.read_csv('Data/car_price_data.csv')
df

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,46120
11910,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,56670
11911,Acura,ZDX,2012,premium unleaded (required),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50620
11912,Acura,ZDX,2013,premium unleaded (recommended),300.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Hatchback,Luxury",Midsize,4dr Hatchback,23,16,204,50920


### **<p style="color:  #4ceae5 ">Explore & Clean The Dataset</p>**

##### Data General Description

In [3]:
# Main description of the dataset
df.describe(include='all')

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
count,11914,11914,11914.0,11911,11845.0,11884.0,11914,11914,11908.0,8172,11914,11914,11914.0,11914.0,11914.0,11914.0
unique,48,915,,10,,,5,4,,71,3,16,,,,
top,Chevrolet,Silverado 1500,,regular unleaded,,,AUTOMATIC,front wheel drive,,Crossover,Compact,Sedan,,,,
freq,1123,156,,7172,,,8266,4787,,1110,4764,3048,,,,
mean,,,2010.384338,,249.38607,5.628829,,,3.436093,,,,26.637485,19.733255,1554.911197,40594.74
std,,,7.57974,,109.19187,1.780559,,,0.881315,,,,8.863001,8.987798,1441.855347,60109.1
min,,,1990.0,,55.0,0.0,,,2.0,,,,12.0,7.0,2.0,2000.0
25%,,,2007.0,,170.0,4.0,,,2.0,,,,22.0,16.0,549.0,21000.0
50%,,,2015.0,,227.0,6.0,,,4.0,,,,26.0,18.0,1385.0,29995.0
75%,,,2016.0,,300.0,6.0,,,4.0,,,,30.0,22.0,2009.0,42231.25


In [4]:
# Observe the data type of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5

In [5]:
# Number of unique value of each column
df.nunique()

Make                   48
Model                 915
Year                   28
Engine Fuel Type       10
Engine HP             356
Engine Cylinders        9
Transmission Type       5
Driven_Wheels           4
Number of Doors         3
Market Category        71
Vehicle Size            3
Vehicle Style          16
highway MPG            59
city mpg               69
Popularity             48
MSRP                 6049
dtype: int64

In [6]:
# number of NaN/NA/null value in each column 
df.isnull().sum()

Make                    0
Model                   0
Year                    0
Engine Fuel Type        3
Engine HP              69
Engine Cylinders       30
Transmission Type       0
Driven_Wheels           0
Number of Doors         6
Market Category      3742
Vehicle Size            0
Vehicle Style           0
highway MPG             0
city mpg                0
Popularity              0
MSRP                    0
dtype: int64

##### Data Manipulation

In [7]:
# replace spaces with underscore in each categorical value and column's name
df.columns = df.columns.str.lower().str.replace(' ', '_')

string_columns = list(df.dtypes[df.dtypes == 'object'].index)

for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')
df

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
0,bmw,1_series_m,2011,premium_unleaded_(required),335.0,6.0,manual,rear_wheel_drive,2.0,"factory_tuner,luxury,high-performance",compact,coupe,26,19,3916,46135
1,bmw,1_series,2011,premium_unleaded_(required),300.0,6.0,manual,rear_wheel_drive,2.0,"luxury,performance",compact,convertible,28,19,3916,40650
2,bmw,1_series,2011,premium_unleaded_(required),300.0,6.0,manual,rear_wheel_drive,2.0,"luxury,high-performance",compact,coupe,28,20,3916,36350
3,bmw,1_series,2011,premium_unleaded_(required),230.0,6.0,manual,rear_wheel_drive,2.0,"luxury,performance",compact,coupe,28,18,3916,29450
4,bmw,1_series,2011,premium_unleaded_(required),230.0,6.0,manual,rear_wheel_drive,2.0,luxury,compact,convertible,28,18,3916,34500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11909,acura,zdx,2012,premium_unleaded_(required),300.0,6.0,automatic,all_wheel_drive,4.0,"crossover,hatchback,luxury",midsize,4dr_hatchback,23,16,204,46120
11910,acura,zdx,2012,premium_unleaded_(required),300.0,6.0,automatic,all_wheel_drive,4.0,"crossover,hatchback,luxury",midsize,4dr_hatchback,23,16,204,56670
11911,acura,zdx,2012,premium_unleaded_(required),300.0,6.0,automatic,all_wheel_drive,4.0,"crossover,hatchback,luxury",midsize,4dr_hatchback,23,16,204,50620
11912,acura,zdx,2013,premium_unleaded_(recommended),300.0,6.0,automatic,all_wheel_drive,4.0,"crossover,hatchback,luxury",midsize,4dr_hatchback,23,16,204,50920


In [8]:
# Fill NA/NaN/Null values 
df['market_category'] = df['market_category'].fillna('unknown')
df['engine_fuel_type'] = df['engine_fuel_type'].fillna('unknown')
df['engine_hp'] = df['engine_hp'].fillna(df['engine_hp'].mode().values[0])
df['engine_cylinders'] = df['engine_cylinders'].fillna(df['engine_cylinders'].mode().values[0])
df['number_of_doors'] = df['number_of_doors'].fillna(df['number_of_doors'].mode().values[0])

In [9]:
# Change column's dtype
df['number_of_doors'] = df['number_of_doors'].astype(int).astype(str) + '_doors'
df['engine_cylinders'] = df['engine_cylinders'].astype(int).astype(str) + '_engine_cylinders'

# age of the cars
df['year'] = 2017 - df['year']

In [10]:
# Make lists of categorical and numerical column
df_categorical = df.select_dtypes(exclude=['int','float'])
df_numerical = df.select_dtypes(include=['int','float'])

categorical = df_categorical.columns
numerical = df_numerical.columns

print(categorical)
print(numerical)

Index(['make', 'model', 'engine_fuel_type', 'engine_cylinders',
       'transmission_type', 'driven_wheels', 'number_of_doors',
       'market_category', 'vehicle_size', 'vehicle_style'],
      dtype='object')
Index(['year', 'engine_hp', 'highway_mpg', 'city_mpg', 'popularity', 'msrp'], dtype='object')


##### Data Visualization

In [11]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Visualize data distribution of numerical column
fig = make_subplots(
                    rows=2, cols=4,
                    column_widths=[0.25, 0.25, 0.25, 0.25],
                    row_heights=[0.5, 0.5],
                    specs=[[{"type": "bar"}, {"type": "bar"}, {"type": "bar"}, {"type": "bar"}],
                           [{"type": "bar"}, {"type": "bar"},       None     ,       None     ]], 
                    horizontal_spacing=0.05, vertical_spacing=0.20,
                    subplot_titles=(re.sub('_', ' ', numerical[0]).title(),
                                    re.sub('_', ' ', numerical[1]).title(), 
                                    re.sub('_', ' ', numerical[2]).title(),
                                    re.sub('_', ' ', numerical[3]).title(),
                                    re.sub('_', ' ', numerical[4]).title(),
                                    re.sub('_', ' ', numerical[5]).title(),),
)

fig.add_trace(go.Histogram(x=df[numerical[0]], nbinsx=50), row=1, col=1)
fig.add_trace(go.Histogram(x=df[numerical[1]], nbinsx=300), row=1, col=2)
fig.add_trace(go.Histogram(x=df[numerical[2]], nbinsx=60), row=1, col=3)
fig.add_trace(go.Histogram(x=df[numerical[3]], nbinsx=70), row=1, col=4)
fig.add_trace(go.Histogram(x=df[numerical[4]], nbinsx=300), row=2, col=1)
fig.add_trace(go.Histogram(x=df[numerical[5]], nbinsx=1000), row=2, col=2)

fig.update_annotations(font_size=10)
fig.update_layout(title='Distribution of Numerical Variables', height=500, width=1050, showlegend = False, font={'size':8},)
fig.show()

In [12]:
# Visualize data distribution of categorical column
for i in categorical:
    fig = go.Figure([go.Bar(x=df[i].value_counts().values[:20], 
                            y=df[i].value_counts().index[:20], 
                            orientation='h',
    )])

    fig.update_xaxes(title='count')
    fig.update_layout(title='Distribution of ' + re.sub('_', ' ', i).title() + ' Variable', 
                    width=1000, height=500)
    fig.show()

In [13]:
# Visualize the relationship between price/msrp and numerical column
for i in numerical:
    if i != 'msrp':
        fig = go.Figure(data=go.Scatter(x=df[i], y=df['msrp'], mode='markers'))
        fig.update_xaxes(title=i)
        fig.update_yaxes(title='msrp')
        fig.update_layout(title='Relationship between ' + re.sub('_', ' ', i).title() + ' and Price', 
                        width=1000, height=500)
        fig.show()

In [14]:
# mean value of price in each unique value of make column
# we can use this to picture relatioship between price and categorical value
df[['make', 'msrp']].groupby('make', as_index=False).mean().sort_values('msrp', ascending=False).reset_index(drop=True)[:10]

Unnamed: 0,make,msrp
0,bugatti,1757224.0
1,maybach,546221.9
2,rolls-royce,351130.6
3,lamborghini,331567.3
4,bentley,247169.3
5,mclaren,239805.0
6,ferrari,238218.8
7,spyker,213323.3
8,aston_martin,197910.4
9,maserati,114207.7


In [15]:
# Another approach to draw the relationship between numerical value (price) and categorical value is using boxplot
for i in categorical:
    if df[i].nunique() < 10:
        fig = go.Figure(data=go.Box(x=df['msrp'], y=df[i], boxmean=True, orientation='h'))
        fig.update_xaxes(title='msrp')
        fig.update_layout(title='Relationship between ' + re.sub('_', ' ', i).title() + ' and Price', 
                        width=1000, height=500)
        fig.show()

In [16]:
# Cleaned data
# return column's format to the previous one
df['number_of_doors'] = df['number_of_doors'].str.replace('_doors', '')
df['engine_cylinders'] = df['engine_cylinders'].str.replace('_engine_cylinders', '')

df_cleaned = df
df_cleaned

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,msrp
0,bmw,1_series_m,6,premium_unleaded_(required),335.0,6,manual,rear_wheel_drive,2,"factory_tuner,luxury,high-performance",compact,coupe,26,19,3916,46135
1,bmw,1_series,6,premium_unleaded_(required),300.0,6,manual,rear_wheel_drive,2,"luxury,performance",compact,convertible,28,19,3916,40650
2,bmw,1_series,6,premium_unleaded_(required),300.0,6,manual,rear_wheel_drive,2,"luxury,high-performance",compact,coupe,28,20,3916,36350
3,bmw,1_series,6,premium_unleaded_(required),230.0,6,manual,rear_wheel_drive,2,"luxury,performance",compact,coupe,28,18,3916,29450
4,bmw,1_series,6,premium_unleaded_(required),230.0,6,manual,rear_wheel_drive,2,luxury,compact,convertible,28,18,3916,34500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11909,acura,zdx,5,premium_unleaded_(required),300.0,6,automatic,all_wheel_drive,4,"crossover,hatchback,luxury",midsize,4dr_hatchback,23,16,204,46120
11910,acura,zdx,5,premium_unleaded_(required),300.0,6,automatic,all_wheel_drive,4,"crossover,hatchback,luxury",midsize,4dr_hatchback,23,16,204,56670
11911,acura,zdx,5,premium_unleaded_(required),300.0,6,automatic,all_wheel_drive,4,"crossover,hatchback,luxury",midsize,4dr_hatchback,23,16,204,50620
11912,acura,zdx,4,premium_unleaded_(recommended),300.0,6,automatic,all_wheel_drive,4,"crossover,hatchback,luxury",midsize,4dr_hatchback,23,16,204,50920


### **<p style="color:#4ceae5">Data Preprocessing</p>**

##### Transform Data

In [17]:
# convert numerical feature to it's log value
for i in numerical:
    df_cleaned[i] = np.log1p(df_cleaned[i])

In [18]:
# one hot encoding categorical features
df_transformed = pd.get_dummies(df_cleaned)
df_transformed

Unnamed: 0,year,engine_hp,highway_mpg,city_mpg,popularity,msrp,make_acura,make_alfa_romeo,make_aston_martin,make_audi,...,vehicle_style_convertible,vehicle_style_convertible_suv,vehicle_style_coupe,vehicle_style_crew_cab_pickup,vehicle_style_extended_cab_pickup,vehicle_style_passenger_minivan,vehicle_style_passenger_van,vehicle_style_regular_cab_pickup,vehicle_style_sedan,vehicle_style_wagon
0,1.945910,5.817111,3.295837,2.995732,8.273081,10.739349,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1.945910,5.707110,3.367296,2.995732,8.273081,10.612779,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1.945910,5.707110,3.367296,3.044522,8.273081,10.500977,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,1.945910,5.442418,3.367296,2.944439,8.273081,10.290483,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1.945910,5.442418,3.367296,2.944439,8.273081,10.448744,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11909,1.791759,5.707110,3.178054,2.833213,5.323010,10.739024,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11910,1.791759,5.707110,3.178054,2.833213,5.323010,10.945018,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11911,1.791759,5.707110,3.178054,2.833213,5.323010,10.832122,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11912,1.609438,5.707110,3.178054,2.833213,5.323010,10.838031,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### Split Train and Test Data

In [19]:
# split features and target value first
df_features = df_transformed.drop('msrp', axis=1)
df_target = df_transformed['msrp']

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_features.iloc[:-1, :], df_target[:-1], random_state=123)

In [21]:
X_train.head()

Unnamed: 0,year,engine_hp,highway_mpg,city_mpg,popularity,make_acura,make_alfa_romeo,make_aston_martin,make_audi,make_bentley,...,vehicle_style_convertible,vehicle_style_convertible_suv,vehicle_style_coupe,vehicle_style_crew_cab_pickup,vehicle_style_extended_cab_pickup,vehicle_style_passenger_minivan,vehicle_style_passenger_van,vehicle_style_regular_cab_pickup,vehicle_style_sedan,vehicle_style_wagon
11077,2.484907,5.283204,3.091042,2.772589,7.234177,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
7164,0.0,5.942799,2.995732,2.772589,4.127134,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1491,1.098612,5.609472,3.401197,3.044522,7.393263,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
11558,1.098612,5.484797,3.465736,3.218876,6.769642,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5634,1.791759,5.303305,3.526361,3.218876,6.77308,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(8934, 1090)
(8934,)
(2979, 1090)
(2979,)


### **<p style="color:#4ceae5">Train & Evaluate The Model</p>**

##### Xgboost (Extreme Gradient Boosting)

In [23]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

xgb_reg = XGBRegressor(
                        booster='gbtree', 
                        objective='reg:squarederror',
)

eval_set = [(X_train, y_train), (X_test, y_test)]
xgb_reg.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="rmse", eval_set=eval_set, verbose=True)

# Evaluating the model
xgb_score = xgb_reg.score(X_test, y_test)
print(f'Test Score: {xgb_score:.4f}')

y_pred = xgb_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Test RMSE: %0.4f" % (rmse))

[0]	validation_0-rmse:6.77811	validation_1-rmse:6.78164
[1]	validation_0-rmse:4.75030	validation_1-rmse:4.75154
[2]	validation_0-rmse:3.33161	validation_1-rmse:3.33256
[3]	validation_0-rmse:2.33951	validation_1-rmse:2.33992
[4]	validation_0-rmse:1.64589	validation_1-rmse:1.64757
[5]	validation_0-rmse:1.16160	validation_1-rmse:1.16297
[6]	validation_0-rmse:0.82440	validation_1-rmse:0.82521
[7]	validation_0-rmse:0.59035	validation_1-rmse:0.59087
[8]	validation_0-rmse:0.42909	validation_1-rmse:0.43200
[9]	validation_0-rmse:0.32003	validation_1-rmse:0.32387
[10]	validation_0-rmse:0.24715	validation_1-rmse:0.25288
[11]	validation_0-rmse:0.20132	validation_1-rmse:0.20891
[12]	validation_0-rmse:0.17245	validation_1-rmse:0.18172
[13]	validation_0-rmse:0.15484	validation_1-rmse:0.16557
[14]	validation_0-rmse:0.14433	validation_1-rmse:0.15665
[15]	validation_0-rmse:0.13804	validation_1-rmse:0.15073
[16]	validation_0-rmse:0.13416	validation_1-rmse:0.14743
[17]	validation_0-rmse:0.13077	validation

##### Cross Validation

In [24]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

kfold = KFold(n_splits=5, shuffle=True, random_state=2)
xgb_reg_scores = cross_val_score(xgb_reg, X_train, y_train, cv=kfold)
print(xgb_reg_scores)
print(f'mean: {xgb_reg_scores.mean()}')

[0.98989054 0.99059773 0.98168083 0.98862814 0.98986858]
mean: 0.9881331627088121


NOTE: with that accuracy I think we don't need model improvement to better our score/rmse. Using xgboost is already a part of score improvement process. But I'll still put model improvement in below section only for the complement of the whole process

### **<p style="color:#4ceae5">Improve Model's Performance</p>**

##### Hyperparameter Tuning using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

parameters = {
                'n_estimators':[100],
                'max_depth': [6, 8],
                'learning_rate': [0.1, 0.2],
                'colsample_bytree': [0.7, 0.8],
                'subsample': [0.4, 0.6],
                'gamma': [0.1, 0.5],
                'min_child_weight': [4, 5],
}
reg = GridSearchCV(xgb_reg, parameters, n_jobs=-1)

gs = reg.fit(X_train, y_train)
clf = gs.best_estimator_

print(f'Test accuracy: {clf.score(X_test, y_test):.3f}')

##### Hyperparameter Tuning using Optuna

In [None]:
import optuna

def objective(trial, data=df_features, target=df_target):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)

    param = {
        'n_estimators': 200,  
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.7,0.8,0.9,1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
        'random_state': trial.suggest_categorical('random_state', [2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
    }

    model = XGBRegressor(**param)  
    model.fit(train_x, train_y, eval_set=[(test_x,test_y)], early_stopping_rounds=100, verbose=False)
    preds = model.predict(test_x)
    rmse = mean_squared_error(test_y, preds, squared=False)
    
    return rmse

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

##### Feature Selection

In [None]:
scores = [v for v in xgb_reg.feature_importances_]

df_features_importance = pd.DataFrame({'scores':scores, 'col_name':df_features.columns})
df_features_importance = df_features_importance.sort_values(by='scores', ascending=False)
df_features_importance

In [None]:
new_features = df_features_importance[df_features_importance['scores']>0]['col_name']
new_features

### Trying the model

In [25]:
data = {'make': 'toyota',
        'model': 'venza',
        'year': 6,
        'engine_fuel_type': 'regular_unleaded',
        'engine_hp': 268.0,
        'engine_cylinders': '6',
        'transmission_type': 'automatic',
        'driven_wheels': 'all_wheel_drive',
        'number_of_doors': '4',
        'market_category': 'crossover,performance',
        'vehicle_size': 'midsize',
        'vehicle_style': 'wagon',
        'highway_mpg': 25,
        'city_mpg': 18,
        'popularity': 2031}

df_new = pd.DataFrame([data])
df_new

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity
0,toyota,venza,6,regular_unleaded,268.0,6,automatic,all_wheel_drive,4,"crossover,performance",midsize,wagon,25,18,2031


In [26]:
# Transform the data
df_new[df_new.select_dtypes(include=['int', 'float']).columns] = np.log1p(df_new.select_dtypes(include=['int', 'float'])[0:1])
df_new

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity
0,toyota,venza,1.94591,regular_unleaded,5.594711,6,automatic,all_wheel_drive,4,"crossover,performance",midsize,wagon,3.258097,2.944439,7.616776


In [27]:
# do one-hot encoding and transpose the data
df_dummy = pd.get_dummies(df_new).T.reset_index().rename(columns={'index':'col_name', 0:'value'})
df_dummy

Unnamed: 0,col_name,value
0,year,1.94591
1,engine_hp,5.594711
2,highway_mpg,3.258097
3,city_mpg,2.944439
4,popularity,7.616776
5,make_toyota,1.0
6,model_venza,1.0
7,engine_fuel_type_regular_unleaded,1.0
8,engine_cylinders_6,1.0
9,transmission_type_automatic,1.0


In [28]:
# make a data that consist of all column name that enter the previous model training process
df_dummy1 = pd.DataFrame({'col_name':xgb_reg.feature_names_in_})
df_dummy1

Unnamed: 0,col_name
0,year
1,engine_hp
2,highway_mpg
3,city_mpg
4,popularity
...,...
1085,vehicle_style_passenger_minivan
1086,vehicle_style_passenger_van
1087,vehicle_style_regular_cab_pickup
1088,vehicle_style_sedan


In [29]:
# merge the transposed data and the "column data" and fill tha NaN value with 0
df_dummy2 = df_dummy1.merge(df_dummy, on='col_name', how='outer').fillna(0)
df_dummy2

Unnamed: 0,col_name,value
0,year,1.945910
1,engine_hp,5.594711
2,highway_mpg,3.258097
3,city_mpg,2.944439
4,popularity,7.616776
...,...,...
1085,vehicle_style_passenger_minivan,0.000000
1086,vehicle_style_passenger_van,0.000000
1087,vehicle_style_regular_cab_pickup,0.000000
1088,vehicle_style_sedan,0.000000


In [30]:
# inverse the prediction value because it's still a log value
np.expm1(xgb_reg.predict(df_dummy2.set_index('col_name').T.values))

array([35343.08], dtype=float32)

### **<p style="color:#4ceae5">Save The Model</p>**

In [None]:
import pickle

pickle.dump(xgb_reg, open("Models/carprice_prediction.pkl", "wb"))
