In [57]:
import pandas as pd 
import numpy as np
import os
# Scikit-Learn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, r2_score
# XgBoost
from xgboost import XGBRegressor


import warnings 
warnings.filterwarnings('ignore')

In [34]:
DATA_DIR_iterim = '../data/iterim/'
df = pd.read_csv(os.path.join(DATA_DIR_iterim, 'bikes-03-no_outliers.csv'))
df.columns

Index(['brand_name', 'model_name', 'motor_size', 'model_year', 'kms_driven',
       'mileage', 'owner', 'power', 'price'],
      dtype='object')

In [54]:
# Split dataset in features and target

X = df.drop(['price'], axis=1)
y = df['price']

In [55]:
# Split in train and test data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) 

In [60]:
cat_features = ['brand_name', 'model_name']
cat_transformer = OneHotEncoder(handle_unknown='ignore')

num_features = ['motor_size', 'model_year', 'kms_driven',
                'mileage', 'owner', 'power']
num_transformer = PowerTransformer()

column_transform = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features)
    ]
)

In [62]:
pipe = Pipeline([("col_trans", column_transform),
                 ("regressor", RandomForestRegressor())])

grid_param = [
                # {
                #     "regressor": [LinearRegression()]                 
                # },
                # {
                #     "regressor": [Lasso()],
                #     "regressor__alpha": [np.linspace(0, 1, 4)]
                # },
                # {
                #     "regressor": [Ridge()],
                #     "regressor__alpha": [np.linspace(0, 1, 4)]
                # },
                {
                    "regressor": [RandomForestRegressor()],
                    #  "regressor__n_estimators": [10, 100],
                    #  "regressor__max_depth": [3, 15, 30, None],
                    #  "regressor__min_samples_leaf": [1, 10, 20],
                #     "regressor__max_leaf_nodes": [2, 5, 10, None]
                },
                {
                    "regressor": [XGBRegressor()],
                #    "regressor__max_depth": [3, 15, 30, None]
                }
]

gridsearch = GridSearchCV(pipe, grid_param, cv=6, verbose=0, n_jobs=-1)
best_model = gridsearch.fit(X_train, y_train) 

In [63]:
pred = best_model.score(X_test, y_test)
pred

0.9375875645345649

In [64]:
pipe = Pipeline([("col_trans", column_transform),
                 ("regressor", RandomForestRegressor())])
pipe.fit(X_train, y_train)

Pipeline(steps=[('col_trans',
                 ColumnTransformer(transformers=[('num', PowerTransformer(),
                                                  ['motor_size', 'model_year',
                                                   'kms_driven', 'mileage',
                                                   'owner', 'power']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['brand_name',
                                                   'model_name'])])),
                ('regressor', RandomForestRegressor())])

In [65]:
pipe.score(X_test, y_test)

0.9384969679871054

In [66]:
y_pred = pipe.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mse**(1/2) * 0.0132


368.96951043910997

In [67]:
pipe2 = Pipeline([("col_trans", column_transform),
                  ("regressor", XGBRegressor())])
pipe2.fit(X_train, y_train)

Pipeline(steps=[('col_trans',
                 ColumnTransformer(transformers=[('num', PowerTransformer(),
                                                  ['motor_size', 'model_year',
                                                   'kms_driven', 'mileage',
                                                   'owner', 'power']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['brand_name',
                                                   'model_name'])])),
                ('regressor',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, g..., gpu_id=-1,
                              importance_type='gain',
                              interaction_constraints='',
                              learning_ra

In [68]:
pipe2.score(X_test, y_test)

0.9357580573692571

In [69]:
y_pred = pipe2.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mse**(1/2) * 0.0132


377.09567325826384

In [70]:
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mse**(1/2) * 0.0132


371.6873510374974