## Import packages

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor


## import dataset + cleaning

In [2]:
# Load the car-price table keyed by car_ID and discard the free-text CarName
# column in a single chained expression (no in-place mutation).
df = (
    pd.read_csv("../../datasets/CarPrice.csv", index_col="car_ID")
    .drop(columns=['CarName'])
)
df.head()

Unnamed: 0_level_0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
car_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
2,3,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
3,1,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
4,2,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
5,2,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


## data splitting

In [3]:
# Features are every column except the target; the target is `price`.
X = df.drop(columns=['price'])
y = df['price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shape of each part as a sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((164, 23), (41, 23), (164,), (41,))

In [4]:
# Numeric columns: fill missing values with the column mean, then standardise.
numerical_pipeline = Pipeline(
    steps=[
        ("inputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time instead of raising.
categorical_pipeline = Pipeline(
    steps=[
        ("inputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown='ignore')),
    ]
)


In [5]:
# Columns routed through the numeric (impute + scale) pipeline.
numeric_features = [
    'wheelbase', 'enginesize', 'boreratio', 'stroke', 'compressionratio',
    'horsepower', 'peakrpm', 'citympg', 'highwaympg',
]

# Columns routed through the categorical (impute + one-hot) pipeline.
categorical_features = [
    'symboling', 'fueltype', 'aspiration', 'doornumber', 'carbody',
    'drivewheel', 'enginelocation', 'enginetype', 'cylindernumber',
    'fuelsystem',
]

# NOTE(review): X has 23 columns but only 19 are listed above; with
# ColumnTransformer's default remainder='drop' the unlisted ones are
# discarded -- confirm this feature selection is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numerical_pipeline, numeric_features),
        ("categoric", categorical_pipeline, categorical_features),
    ]
)

In [6]:
# End-to-end estimator: preprocessing followed by a k-nearest-neighbours
# regressor. The step names ("prep", "algo") are the prefixes used when
# addressing hyper-parameters, e.g. "algo__n_neighbors".
pipeline = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("algo", KNeighborsRegressor()),
    ]
)

In [7]:
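# Uncomment to list the tunable parameter names (e.g. "algo__n_neighbors")
# that can be used as keys in the search grid:
# pipeline.get_params()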

In [8]:
# Hyper-parameter grid for the "algo" (KNeighborsRegressor) step of the pipeline.
# 25 neighbour counts x 2 weightings x 2 distance powers = 100 candidates.
# This must be a plain dict: a trailing comma after the closing brace would
# silently turn it into a 1-tuple containing the dict.
parameter = {
    "algo__n_neighbors": range(1, 51, 2),      # odd k from 1 to 49
    "algo__weights": ['uniform', 'distance'],
    "algo__p": [1, 2],                         # Minkowski power: 1 = Manhattan, 2 = Euclidean
}

# Exhaustive search over `parameter` with 3-fold cross-validation on the
# training split, using all available CPU cores.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('prep',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('inputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['wheelbase',
                                                                          'enginesize',
                                                                          'boreratio',
                                                                          'stroke',
                                                                          'compress

In [9]:
# No `scoring` was passed to the search, so these use the regressor's default
# score (R^2). Compare the three values to judge over-fitting.
(
    model.score(X_train, y_train),  # score on the training split
    model.best_score_,              # mean cross-validated score of the best candidate
    model.score(X_test, y_test),    # score on the held-out test split
)

(0.9940424799018482, 0.7831089297934758, 0.7971842626816532)

## Export and reload the model

In [10]:
import pickle

In [11]:
# Persist the fitted search object to disk. The context manager guarantees the
# file is flushed and closed (even if pickling raises) before any later cell
# tries to read it back.
with open("../models/knnr_carprice.pkl", 'wb') as model_file:
    pickle.dump(model, model_file)

In [12]:
# Reload the saved model, closing the file handle as soon as it has been read.
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load artifacts written by this notebook or another trusted source.
with open("../models/knnr_carprice.pkl", "rb") as model_file:
    model = pickle.load(model_file)

## predict

In [16]:
# Uncomment to check the column order and dtypes that hand-written
# prediction rows must follow:
# X.info()

In [14]:
# Two hand-written cars to price. Values are positional, so each row must
# follow the column order of X exactly (one value per feature column).
data = [
    [
        0, "gas", "turbo", "two", "hatchback", "4wd", "front",
        99.5, 178.2, 67.9, 52, 3053, "ohc", "five",
        131, "mpfi", 3.13, 3.4, 7, 160, 5500, 16, 22,
    ],
    [
        0, "diesel", "std", "two", "hatchback", "rwd", "front",
        97.5, 160.2, 65, 52, 3000, "ohc", "four",
        130, "mpfi", 3.1, 2.98, 7, 140, 5300, 16, 22,
    ],
]

# Wrap the rows in a frame with the training feature names so the pipeline's
# column-based preprocessing can select them by name.
X_pred = pd.DataFrame(data=data, columns=X.columns, index=['satu', 'dua'])
X_pred

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,...,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
satu,0,gas,turbo,two,hatchback,4wd,front,99.5,178.2,67.9,...,five,131,mpfi,3.13,3.4,7,160,5500,16,22
dua,0,diesel,std,two,hatchback,rwd,front,97.5,160.2,65.0,...,four,130,mpfi,3.1,2.98,7,140,5300,16,22


In [15]:
# Predict the target (price) for the hand-built rows in X_pred using the
# model reloaded from the pickle file.
model.predict(X_pred)

array([18221.26173805, 15246.13707331])