In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np
# Seed NumPy's global RNG once, up front. None of the later cells shown here
# pass an explicit random_state, so presumably this global seed is what keeps
# a fresh top-to-bottom run repeatable; out-of-order execution will not be.
np.random.seed(1)

In [2]:
df = pd.read_csv('car-sales-extended.csv')

In [3]:
df.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Make           1000 non-null   object
 1   Colour         1000 non-null   object
 2   Odometer (KM)  1000 non-null   int64 
 3   Doors          1000 non-null   int64 
 4   Price          1000 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 39.2+ KB


In [5]:
cat_features = ['Make', 'Colour', 'Doors']

In [6]:
# Feature matrix: every column except the regression target.
X = df.drop(columns='Price')

In [7]:
X.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [8]:
# Regression target.
y = df['Price']

In [9]:
y.head()

0    15323
1    19943
2    28343
3    13434
4    14043
Name: Price, dtype: int64

In [10]:
# One-hot encode the columns named in cat_features; whatever is left over
# is forwarded unchanged and appended after the encoded columns.
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[('one_hot', one_hot, cat_features)],
    remainder='passthrough',
)

In [11]:
transformed_X = transformer.fit_transform(X)

In [12]:
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [13]:
pd.DataFrame(transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [14]:
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

In [15]:
reg = RandomForestRegressor()

In [16]:
reg.fit(X_train, y_train)

RandomForestRegressor()

In [17]:
reg.score(X_test, y_test)

0.31207113535105824

# Handling Missing Values

In [18]:
df_missing = pd.read_csv('car-sales-extended-missing-data.csv')

In [19]:
df_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [20]:
# Rows without a target value cannot be used for supervised training, so
# drop them. `subset` is given as a list (the same form the Pipelines section
# uses) because a bare string label is only accepted by newer pandas releases,
# and the result is rebound instead of mutating the frame in place.
df_missing = df_missing.dropna(subset=['Price'])

In [21]:
df_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [22]:
# Split the cleaned frame into target and features.
y = df_missing['Price']
X = df_missing.drop(columns='Price')

In [23]:
X.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0


In [24]:
X.shape

(950, 4)

In [25]:
y.shape

(950,)

In [26]:
# Column groups, one per imputation strategy.
cat_features = ['Make', 'Colour']
door_feature = ['Doors']
num_feature = ['Odometer (KM)']

# Text columns get a literal 'missing' category, the door count falls back
# to 4, and the odometer reading is filled with the column mean.
cat_imputer = SimpleImputer(fill_value='missing', strategy='constant')
door_imputer = SimpleImputer(fill_value=4, strategy='constant')
num_imputer = SimpleImputer(strategy='mean')

In [27]:
# Route each column group to its imputer. The listing order here is the
# column order of the transformed output.
imputer = ColumnTransformer(
    transformers=[
        ('cat_imputer', cat_imputer, cat_features),
        ('door_imputer', door_imputer, door_feature),
        ('num_imputer', num_imputer, num_feature),
    ]
)

In [28]:
# NOTE(review): the imputer is fitted on every row before train_test_split is
# called further down, so the odometer mean used for filling is computed with
# rows that later land in the test set. The "Using Pipelines" section fits
# the same kind of imputers on the training split only.
filled_X = imputer.fit_transform(X)

In [29]:
filled_X

array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [30]:
X.columns

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors'], dtype='object')

In [31]:
# Wrap the imputed array in a DataFrame again. The labels follow the imputer's
# output order (Doors now precedes Odometer), not the order of X.columns.
filled_X = pd.DataFrame(
    filled_X,
    columns=['Make', 'Colour', 'Doors', 'Odometer (KM)'],
)

In [32]:
filled_X.head()

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4.0,35431.0
1,BMW,Blue,5.0,192714.0
2,Honda,White,4.0,84714.0
3,Toyota,White,4.0,154365.0
4,Nissan,Blue,3.0,181577.0


In [33]:
filled_X.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [34]:
cat_features = ['Make', 'Colour', 'Doors']

In [35]:
# Fresh encoder for the imputed frame: one-hot encode the columns named in
# cat_features and forward the remaining column(s) unchanged.
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[('one_hot', one_hot, cat_features)],
    remainder='passthrough',
)

In [36]:
transformed_X = transformer.fit_transform(filled_X)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

In [38]:
reg = RandomForestRegressor()

In [39]:
reg.fit(X_train, y_train)

RandomForestRegressor()

In [40]:
reg.score(X_test, y_test)

0.2998773752984115

In [41]:
y_pred = reg.predict(X_test)

In [42]:
# scikit-learn metrics take (y_true, y_pred) in that order. MAE happens to be
# symmetric, so the value is unchanged, but keeping the documented order
# matches the r2_score / mean_squared_error calls below and avoids copying a
# swapped call onto a metric where the order matters.
mean_absolute_error(y_test, y_pred)

5441.7080113491775

In [43]:
reg.score(X_test, y_test)

0.2998773752984115

In [44]:
r2_score(y_test, y_pred)

0.2998773752984115

In [45]:
mae = mean_absolute_error(y_test, y_pred)

In [46]:
mae

5441.7080113491775

In [47]:
mse = mean_squared_error(y_test, y_pred)

In [48]:
mse

45829629.00865266

In [49]:
# Mean 5-fold cross-validated score. With scoring=None the estimator's own
# .score() is used, which for this regressor is R^2 (compare the identical
# reg.score / r2_score outputs above).
cross_val_score(reg, transformed_X, y, cv=5, scoring=None).mean()

0.2187677110252931

In [50]:
# Mean 5-fold cross-validated MSE, reported negated by the 'neg_' scorer.
cross_val_score(
    reg, transformed_X, y, cv=5, scoring='neg_mean_squared_error'
).mean()

-57047657.9542609

In [51]:
# Mean 5-fold cross-validated MAE, reported negated by the 'neg_' scorer.
cross_val_score(
    reg, transformed_X, y, cv=5, scoring='neg_mean_absolute_error'
).mean()

-5938.162074508021

# Using Pipelines

In [73]:
data = pd.read_csv('car-sales-extended-missing-data.csv')

In [74]:
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [75]:
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [76]:
# Rows without a target value cannot be used for training, so drop them.
# Rebinding the result (instead of inplace=True) avoids mutating the frame
# that the earlier cells displayed.
data = data.dropna(subset=['Price'])

In [77]:
data.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [78]:
cat_features = ['Make', 'Colour']

In [79]:
# Categorical branch: fill gaps with the literal 'missing' category, then
# one-hot encode. handle_unknown='ignore' is set so that categories absent
# from the fitted data do not raise at transform time.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [80]:
door_feature = ['Doors']

In [81]:
# Door-count branch: missing values become 4. Unlike the earlier sections,
# Doors is not one-hot encoded here; it reaches the model as a number.
door_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value=4, strategy='constant')),
])

In [82]:
numeric_features = ['Odometer (KM)']

In [83]:
# Numeric branch: mean imputation by default. The step name 'imputer' must
# stay as-is; the search grid addresses it as
# 'preprocessor__num__imputer__strategy'.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

In [84]:
# Combine the three branches, each applied to its own column group. The
# transformer names ('cat', 'door', 'num') are part of the parameter paths
# used by the hyperparameter search, so they are kept verbatim.
preprocessor = ColumnTransformer([
    ('cat', cat_transformer, cat_features),
    ('door', door_transformer, door_feature),
    ('num', numeric_transformer, numeric_features),
])

In [85]:
# End-to-end estimator: preprocessing followed by a random forest. Fitting
# this pipeline fits the imputers/encoder only on the data passed to .fit().
# Step names 'preprocessor' and 'model' are referenced by the search grid.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor()),
])

In [86]:
# Split the frame into target and raw (still un-imputed) features.
y = data['Price']
X = data.drop(columns='Price')

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [88]:
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Make', 'Colour']),
                                                 ('door',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value=4,
                                                                                 strategy='constant'))]),
                              

In [89]:
model.score(X_test, y_test)

0.14610559258440237

# Using a Pipeline with GridSearchCV and RandomizedSearchCV

In [69]:
# Hyperparameter grid for the `model` pipeline. Keys use the
# `<step>__<nested step>__<param>` addressing scheme, so they must match the
# step/transformer names chosen when the pipeline was built.
# 2 * 2 * 2 * 1 * 2 = 16 candidate settings.
pipe_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'model__n_estimators': [100, 1000],
    'model__max_depth': [None, 5],
    # 'auto' is rejected by current scikit-learn releases. For a forest
    # regressor it meant "consider all features", which 1.0 expresses on
    # both old and new versions.
    'model__max_features': [1.0],
    'model__min_samples_split': [2, 4]
}

In [70]:
# Exhaustive search over pipe_grid with 5-fold CV; verbose=2 logs every fit.
gs_model = GridSearchCV(
    estimator=model,
    param_grid=pipe_grid,
    cv=5,
    verbose=2,
)

In [71]:
gs_model.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=auto, model__min_sampl

[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time=   0.0s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time=   0.0s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time=   0.7s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time=   0.6s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strategy=mean; total time=   0.7s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=2, model__n_estimators=1000, preprocessor__num__imputer__strate

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(fill_value='missing',
                                                                                                        strategy='constant')),
                                                                                         ('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['Make',
                                                                          'Colour']),
                                                                        ('door',
         

In [72]:
gs_model.score(X_test, y_test)

0.3898925148200737