In [90]:
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import sklearn
import joblib

from collections import defaultdict

from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from yellowbrick.model_selection import ValidationCurve

%config InlineBackend.figure_format = 'retina'

In [13]:
df = pd.read_csv('../datasets/autoria/autoria_encoded_data.csv', index_col=0)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129292 entries, 0 to 146813
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   brand              129292 non-null  int64  
 1   fuel_type          129292 non-null  int64  
 2   transmission_type  129292 non-null  int64  
 3   price_USD          129292 non-null  int64  
 4   mileage_kkm        129292 non-null  int64  
 5   year_made          129292 non-null  int64  
 6   engine_size        129292 non-null  float64
dtypes: float64(1), int64(6)
memory usage: 7.9 MB


### Random Forest

In [15]:
y = df['price_USD']
X = df.drop(columns=['price_USD'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
X_train

Unnamed: 0,brand,fuel_type,transmission_type,mileage_kkm,year_made,engine_size
119142,72,5,0,175,2015,2.5
119249,6,5,0,60,2017,3.0
5885,35,1,0,92,2012,0.0
26768,67,3,0,203,2007,3.5
110819,74,5,3,117,2007,5.4
...,...,...,...,...,...,...
145400,128,3,1,42,2002,1.7
117562,60,0,1,136,2007,2.5
928,22,5,0,0,2021,1.2
17570,46,5,0,120,2008,2.4


In [101]:
rfr = RandomForestRegressor(
    n_estimators=100,
    max_depth=14,
    random_state=1,
    n_jobs=8,
)
rfr.fit(X_train, y_train)
print('score:', rfr.score(X_test, y_test))

score: 0.8992945045946545


In [102]:
y_pred = rfr.predict(X_test)

In [103]:
sklearn.metrics.mean_absolute_percentage_error(y_test, y_pred)

0.1690812819472468

#### Saving model

In [104]:
model_path = 'models/tab_price_0/rfr_model.sav'
joblib.dump(rfr, model_path)

['models/tab_price_0/rfr_model.sav']

#### Inference example

In [105]:
loaded_rfr = joblib.load(model_path)

In [106]:
feature_encoding_path = '../datasets/autoria/feature_encoding.pkl'
with open(feature_encoding_path, 'rb') as f:
    encoding = pickle.load(f)

In [117]:
data = {
    'brand' : 'BMW',
    'fuel_type' : 'petrol',
    'transmission_type' : 'automatic',
    'mileage_kkm' : 145,
    'year_made' : 2010,
    'engine_size' : 4.4,
}

In [118]:
def encode_features(features, encoding):
    for f, val in features.items():
        val = val.lower() if isinstance(val, str) else val
        if f in encoding:
            features[f] = encoding[f].transform([val])

In [119]:
encode_features(data, encoding)

In [120]:
data

{'brand': array([11]),
 'fuel_type': array([5]),
 'transmission_type': array([0]),
 'mileage_kkm': 145,
 'year_made': 2010,
 'engine_size': 4.4}

In [111]:
encoded_data = [list(data.values())]
encoded_data

[[array([11]), array([5]), array([0]), 145, 2010, 4.4]]

In [112]:
loaded_rfr.predict(encoded_data)

array([21834.36676397])