In [5]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor()

np.random.seed(45)
data = pd.read_csv('Data/car-sales-extended-missing-data.csv')

data.dropna(subset=['Price'],inplace=True)

X = data.drop(['Price'],axis=1)
Y = data['Price']
#print(data.isna().sum())

#defining FEature(both missing data and encoding)
Categorical_features = ['Make','Colour']
Numerical_features = ['Odometer (KM)']
Doors_features = ['Doors']

#Encoding feature
imputer_categorical = SimpleImputer(strategy='constant',fill_value='Missing')
imputer_numerical = SimpleImputer(strategy='mean')
imputer_door = SimpleImputer(strategy='constant',fill_value=4)

CT_imputer = ColumnTransformer(transformers=[('imputer_categorical',imputer_categorical,Categorical_features),
                                             ('imputer_numerical',imputer_numerical,Numerical_features),
                                             ('imputer_door',imputer_door,Doors_features)
                                             ])


one_hot = OneHotEncoder()
CT_one_hot = ColumnTransformer(transformers=[('one_hot',OneHotEncoder(categories='auto'),Categorical_features)],
                               remainder='passthrough')

data_new = CT_imputer.fit_transform(data)
df = DataFrame(data_new,columns=['Make','Colour','Odometer (KM)','Doors'])
df = CT_one_hot.fit_transform(df)

#Splitting data into train_test_split and modelling
X_train,X_test,Y_train,Y_test = train_test_split(df,Y,test_size=0.2)

RFR.fit(X_train,Y_train)




RandomForestRegressor()

In [6]:
np.random.seed(45)
Ypred = RFR.predict(X_test)
RFR.score(X_test,Y_test)

0.2969869083504645

In [18]:
RFRR = RandomForestRegressor(n_jobs=1)
grid = {
        'max_depth':[None],
        'min_samples_split':[2,4],
        'min_samples_leaf':[1,2,3],
        'max_features':['auto'],
        }


In [19]:
GSC = GridSearchCV(
             estimator=RFRR,
             param_grid=grid,
             cv=5,
             verbose=2,
)


In [20]:
GSC.fit(X_train,Y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, total=   0.7s
[CV] max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV]  max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, total=   0.6s
[CV] max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, total=   0.5s
[CV] max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, total=   0.5s
[CV] max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, total=   0.5s
[CV] max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=4 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=4, total=   0.4s
[CV] max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=4 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=4, total=   0.4s
[CV] max_depth=None

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   13.4s finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(n_jobs=1),
             param_grid={'max_depth': [None], 'max_features': ['auto'],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 4]},
             verbose=2)

In [21]:
GSC.best_estimator_

RandomForestRegressor(min_samples_leaf=3, n_jobs=1)

In [22]:
GSC.best_params_

{'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 3,
 'min_samples_split': 2}

In [23]:
GSC.best_score_

0.1880095079891496

In [24]:
Ypred = GSC.predict(X_test)

In [25]:
GSC.score(X_test,Y_test)

0.3713091671005393