In [1]:
#importing library
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
# Fix the seed of NumPy's global random generator
np.random.seed(45)

# Read the car-sales CSV, then keep only the rows that have a 'Price' value
# (rows without the target cannot be used for supervised training)
data = pd.read_csv('Data/car-sales-extended-missing-data.csv')
data = data.dropna(subset=['Price'])
print(data.head())

# Column groups; each group gets its own preprocessing pipeline below
Categorical_parameter = ['Make', 'Colour']
Numerical_parameter = ['Odometer (KM)']
Door_parameter = ['Doors']

# Categorical columns: replace missing entries with the literal 'Missing',
# then one-hot encode (handle_unknown='ignore' for categories not seen in fit)
Categorical_coder = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

# Odometer reading: replace missing entries with the column mean
Numerical_coder = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Door count: replace missing entries with the constant 4
Door_coder = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=4)),
])

# Route each column group to its pipeline; the step names are referenced by
# the grid-search parameter keys, so they must not be renamed.
# remainder='passthrough' keeps any column that is not listed here.
Preprocessor = ColumnTransformer(
    [
        ('cate_par', Categorical_coder, Categorical_parameter),
        ('nume_par', Numerical_coder, Numerical_parameter),
        ('door_par', Door_coder, Door_parameter),
    ],
    remainder='passthrough',
)

# Full model: preprocessing followed by a random-forest regressor
modelling = Pipeline([
    ('Preprocessor', Preprocessor),
    ('RandomForest', RandomForestRegressor()),
])

# Target is the 'Price' column; every other column is a feature
Y = data['Price']
X = data.drop(columns=['Price'])

# Hold out 20% of the rows for testing
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
)

# Fit the preprocessing + random-forest pipeline on the training rows
modelling.fit(X_train, Y_train)

# Score the fitted pipeline on the held-out rows
# (modelling.predict(X_test) returns the individual predictions if needed)
modelling.score(X_test, Y_test)


     Make Colour  Odometer (KM)  Doors    Price
0   Honda  White        35431.0    4.0  15323.0
1     BMW   Blue       192714.0    5.0  19943.0
2   Honda  White        84714.0    4.0  28343.0
3  Toyota  White       154365.0    4.0  13434.0
4  Nissan   Blue       181577.0    3.0  14043.0


0.29461569831729795

In [2]:
# Try to improve on the baseline with a cross-validated grid search.
# Keys address nested parameters as '<step>__<sub-step>__<parameter>'.
grid = {
    'Preprocessor__nume_par__imputer__strategy': ['mean', 'median'],
    'RandomForest__min_samples_split': [2, 4],
    'RandomForest__max_depth': [None, 2],
}

# 2 x 2 x 2 = 8 candidate settings, each evaluated with 5-fold CV on the
# training rows; fit() returns the search object itself
GSC = GridSearchCV(
    estimator=modelling,
    param_grid=grid,
    cv=5,
    verbose=2,
).fit(X_train, Y_train)

# Score of the fitted search object on the held-out test rows
scorer = GSC.score(X_test, Y_test)
scorer


Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] Preprocessor__nume_par__imputer__strategy=mean, RandomForest__max_depth=None, RandomForest__min_samples_split=2 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  Preprocessor__nume_par__imputer__strategy=mean, RandomForest__max_depth=None, RandomForest__min_samples_split=2, total=   0.6s
[CV] Preprocessor__nume_par__imputer__strategy=mean, RandomForest__max_depth=None, RandomForest__min_samples_split=2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  Preprocessor__nume_par__imputer__strategy=mean, RandomForest__max_depth=None, RandomForest__min_samples_split=2, total=   0.5s
[CV] Preprocessor__nume_par__imputer__strategy=mean, RandomForest__max_depth=None, RandomForest__min_samples_split=2 
[CV]  Preprocessor__nume_par__imputer__strategy=mean, RandomForest__max_depth=None, RandomForest__min_samples_split=2, total=   0.5s
[CV] Preprocessor__nume_par__imputer__strategy=mean, RandomForest__max_depth=None, RandomForest__min_samples_split=2 
[CV]  Preprocessor__nume_par__imputer__strategy=mean, RandomForest__max_depth=None, RandomForest__min_samples_split=2, total=   0.5s
[CV] Preprocessor__nume_par__imputer__strategy=mean, RandomForest__max_depth=None, RandomForest__min_samples_split=2 
[CV]  Preprocessor__nume_par__imputer__strategy=mean, RandomForest__max_depth=None, RandomForest__min_samples_split=2, total=   0.5s
[CV] Preprocessor__nume_par__imputer__strategy=mean, RandomForest__max_depth=None, RandomForest__min_samples_split

[CV]  Preprocessor__nume_par__imputer__strategy=median, RandomForest__max_depth=2, RandomForest__min_samples_split=2, total=   0.4s
[CV] Preprocessor__nume_par__imputer__strategy=median, RandomForest__max_depth=2, RandomForest__min_samples_split=4 
[CV]  Preprocessor__nume_par__imputer__strategy=median, RandomForest__max_depth=2, RandomForest__min_samples_split=4, total=   0.3s
[CV] Preprocessor__nume_par__imputer__strategy=median, RandomForest__max_depth=2, RandomForest__min_samples_split=4 
[CV]  Preprocessor__nume_par__imputer__strategy=median, RandomForest__max_depth=2, RandomForest__min_samples_split=4, total=   0.3s
[CV] Preprocessor__nume_par__imputer__strategy=median, RandomForest__max_depth=2, RandomForest__min_samples_split=4 
[CV]  Preprocessor__nume_par__imputer__strategy=median, RandomForest__max_depth=2, RandomForest__min_samples_split=4, total=   0.3s
[CV] Preprocessor__nume_par__imputer__strategy=median, RandomForest__max_depth=2, RandomForest__min_samples_split=4 
[CV]

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   17.4s finished


0.38367188368769367

In [5]:
# Show the parameter combination the grid search selected as best
GSC.best_params_

{'Preprocessor__nume_par__imputer__strategy': 'median',
 'RandomForest__max_depth': 2,
 'RandomForest__min_samples_split': 4}