In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, PolynomialFeatures

In [2]:
df = pd.read_csv("../../datasets/CarPrice.csv", index_col="car_ID")
df.drop(labels=['CarName'], axis=1, inplace=True)
df.head()

Unnamed: 0_level_0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
car_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
2,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
3,1,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
4,2,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
5,2,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


## splitting

In [3]:
X = df.drop(labels=['price'], axis=1)
y = df.price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((164, 23), (41, 23), (164,), (41,))

## training

In [4]:
numerical_pipeline = Pipeline([
    ("inputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])
categorical_pipeline = Pipeline([
    ("inputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown = 'ignore'))
])


In [5]:
X_train.head()

Unnamed: 0_level_0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
car_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
67,0,diesel,std,four,sedan,rwd,front,104.9,175.0,66.1,...,four,134,idi,3.43,3.64,22.0,72,4200,31,39
112,0,gas,std,four,sedan,rwd,front,107.9,186.7,68.4,...,four,120,mpfi,3.46,2.19,8.4,95,5000,19,24
154,0,gas,std,four,wagon,fwd,front,95.7,169.7,63.6,...,four,92,2bbl,3.05,3.03,9.0,62,4800,31,37
97,1,gas,std,four,sedan,fwd,front,94.5,165.3,63.8,...,four,97,2bbl,3.15,3.29,9.4,69,5200,31,37
39,0,gas,std,two,hatchback,fwd,front,96.5,167.5,65.2,...,four,110,1bbl,3.15,3.58,9.0,86,5800,27,33


In [6]:
preprocessor = ColumnTransformer([
    ("numeric", numerical_pipeline, ['wheelbase','enginesize','boreratio','stroke','compressionratio','horsepower','peakrpm','citympg','highwaympg']),
    ("categoric", categorical_pipeline, ['symboling','fueltype','aspiration','doornumber','carbody','drivewheel','enginelocation','enginetype','cylindernumber','fuelsystem'])
])

In [7]:
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
pipeline = Pipeline([
    ("prep", preprocessor),
    ("algo", SVR())
])

In [8]:
# pipeline.get_params()

In [9]:
from sklearn.model_selection import GridSearchCV

parameter = {
    'algo__C' : [1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9,9.5,10],
    'algo__gamma' : ('auto','scale'),
},

model = GridSearchCV(pipeline, parameter, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

Fitting 3 folds for each of 38 candidates, totalling 114 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('inputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['wheelbase',
                                                                          'enginesize',
                                                                          'boreratio',
                                                                          'stroke',
                                                                          'compress

In [10]:
model.best_params_

{'algo__C': 10, 'algo__gamma': 'scale'}

In [11]:
model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test)

(-0.07937571040183222, -0.10794518260572539, -0.0760196784433449)

## ga cocok. ga bagus

### polynomial features (pangkat linear)

In [14]:
numerical_pipeline = Pipeline([
    ("inputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    ("poly",PolynomialFeatures())
])
categorical_pipeline = Pipeline([
    ("inputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown = 'ignore'))
])

preprocessor = ColumnTransformer([
    ("numeric", numerical_pipeline, ['wheelbase','enginesize','boreratio','stroke','compressionratio','horsepower','peakrpm','citympg','highwaympg']),
    ("categoric", categorical_pipeline, ['symboling','fueltype','aspiration','doornumber','carbody','drivewheel','enginelocation','enginetype','cylindernumber','fuelsystem'])
])
pipeline = Pipeline([
    ("prep", preprocessor),
    ("algo", SVR())
])
parameter = {
    'algo__C' : [1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9,9.5,10],
    'algo__gamma' : ('auto','scale'),
    'prep__numeric__poly__degree' : [1,2],
    'prep__numeric__poly__interaction_only' : [True, False],
},

model = GridSearchCV(pipeline, parameter, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

Fitting 3 folds for each of 152 candidates, totalling 456 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('inputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler()),
                                                                                         ('poly',
                                                                                          PolynomialFeatures())]),
                                                                         ['wheelbase',
                                                                          'enginesize',
                                           

In [15]:
model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test)

(-0.07905280864589126, -0.10772174851321854, -0.07565646254752334)

# save model

In [17]:
import pickle

In [18]:
pickle.dump(model, open("../models/svm_carprice.pkl",'wb'))

In [20]:
model = pickle.load(open("../models/svm_carprice.pkl", "rb"))
model

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('inputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler()),
                                                                                         ('poly',
                                                                                          PolynomialFeatures())]),
                                                                         ['wheelbase',
                                                                          'enginesize',
                                           