<a href="https://colab.research.google.com/github/bpandey369/Feature-Engineering/blob/main/sklearn_column_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv('/content/cars.csv')

In [None]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [None]:
df['owner'].value_counts()

First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: owner, dtype: int64

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
                                                      df.drop(columns=['selling_price']),
                                                      df['selling_price'],
                                                      test_size=0.2,
                                                      random_state=42
                                                    )

### The Hard Way!

In [None]:
# apply ordinal encoder to owner
oe = OrdinalEncoder(categories=[['Test Drive Car', 'Fourth & Above Owner', 'Third Owner', 'Second Owner', 'First Owner']])
oe.set_output(transform='pandas')
X_train_owner = oe.fit_transform(X_train.loc[:,['owner']])
X_test_owner = oe.transform(X_test.loc[:,['owner']])

In [None]:
X_train_owner

Unnamed: 0,owner
6518,4.0
6144,3.0
6381,1.0
438,3.0
5939,4.0
...,...
5226,4.0
5390,3.0
860,4.0
7603,4.0


In [None]:
# apply ohe to brand and fuel
ohe = OneHotEncoder(sparse_output=False)
ohe.set_output(transform='pandas')
X_train_brand_fuel = ohe.fit_transform(X_train[['brand','fuel']])
X_test_brand_fuel = ohe.transform(X_test[['brand','fuel']])

In [None]:
X_train_brand_fuel

Unnamed: 0,brand_Ambassador,brand_Ashok,brand_Audi,brand_BMW,brand_Chevrolet,brand_Daewoo,brand_Datsun,brand_Fiat,brand_Force,brand_Ford,...,brand_Renault,brand_Skoda,brand_Tata,brand_Toyota,brand_Volkswagen,brand_Volvo,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol
6518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
860,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
X_train_rem = X_train.drop(columns=['brand','fuel','owner'],inplace=True)
X_test_rem = X_test.drop(columns=['brand','fuel','owner'],inplace=True)

In [None]:
X_train.head()

Unnamed: 0,km_driven
6518,2560
6144,80000
6381,150000
438,120000
5939,25000


### The Easy Way!

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
df = pd.read_csv('cars.csv')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
                                                      df.drop(columns=['selling_price']),
                                                      df['selling_price'],
                                                      test_size=0.2,
                                                      random_state=42
                                                    )

In [None]:
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
6518,Tata,2560,Petrol,First Owner
6144,Honda,80000,Petrol,Second Owner
6381,Hyundai,150000,Diesel,Fourth & Above Owner
438,Maruti,120000,Diesel,Second Owner
5939,Maruti,25000,Petrol,First Owner


In [None]:
transformer = ColumnTransformer(
    [
        ("ordinal", OrdinalEncoder(categories=[['Test Drive Car', 'Fourth & Above Owner', 'Third Owner', 'Second Owner', 'First Owner']]), ['owner']),
        ("onehot", OneHotEncoder(sparse_output=False), ['brand', 'fuel'])
    ],
    remainder='passthrough'
)


In [None]:
transformer

In [None]:
X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)

In [None]:
pd.DataFrame(X_train_transformed, columns=transformer.get_feature_names_out())

# Check if the issue persists

Unnamed: 0,ordinal__owner,onehot__brand_Ambassador,onehot__brand_Ashok,onehot__brand_Audi,onehot__brand_BMW,onehot__brand_Chevrolet,onehot__brand_Daewoo,onehot__brand_Datsun,onehot__brand_Fiat,onehot__brand_Force,...,onehot__brand_Skoda,onehot__brand_Tata,onehot__brand_Toyota,onehot__brand_Volkswagen,onehot__brand_Volvo,onehot__fuel_CNG,onehot__fuel_Diesel,onehot__fuel_LPG,onehot__fuel_Petrol,remainder__km_driven
0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2560.0
1,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,80000.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,150000.0
3,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,120000.0
4,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,25000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6497,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,120000.0
6498,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,80000.0
6499,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,35000.0
6500,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,27000.0


In [None]:
transformer.feature_names_in_

array(['brand', 'km_driven', 'fuel', 'owner'], dtype=object)

In [None]:
transformer.get_feature_names_out()

array(['ordinal__owner', 'onehot__brand_Ambassador',
       'onehot__brand_Ashok', 'onehot__brand_Audi', 'onehot__brand_BMW',
       'onehot__brand_Chevrolet', 'onehot__brand_Daewoo',
       'onehot__brand_Datsun', 'onehot__brand_Fiat',
       'onehot__brand_Force', 'onehot__brand_Ford', 'onehot__brand_Honda',
       'onehot__brand_Hyundai', 'onehot__brand_Isuzu',
       'onehot__brand_Jaguar', 'onehot__brand_Jeep', 'onehot__brand_Kia',
       'onehot__brand_Land', 'onehot__brand_Lexus', 'onehot__brand_MG',
       'onehot__brand_Mahindra', 'onehot__brand_Maruti',
       'onehot__brand_Mercedes-Benz', 'onehot__brand_Mitsubishi',
       'onehot__brand_Nissan', 'onehot__brand_Opel',
       'onehot__brand_Peugeot', 'onehot__brand_Renault',
       'onehot__brand_Skoda', 'onehot__brand_Tata',
       'onehot__brand_Toyota', 'onehot__brand_Volkswagen',
       'onehot__brand_Volvo', 'onehot__fuel_CNG', 'onehot__fuel_Diesel',
       'onehot__fuel_LPG', 'onehot__fuel_Petrol', 'remainder__km_drive

In [None]:
transformer.n_features_in_

4

In [None]:
transformer.named_transformers_

{'ordinal': OrdinalEncoder(categories=[['Test Drive Car', 'Fourth & Above Owner',
                             'Third Owner', 'Second Owner', 'First Owner']]),
 'onehot': OneHotEncoder(sparse_output=False),
 'remainder': 'passthrough'}

In [None]:
transformer.output_indices_

{'ordinal': slice(0, 1, None),
 'onehot': slice(1, 37, None),
 'remainder': slice(37, 38, None)}

In [None]:
import joblib

joblib.dump(transformer,'transformer.joblib')

['transformer.joblib']

In [None]:
# import the transformer

transformer_new = joblib.load('transformer.joblib')
transformer_new

### Sklearn Pipeline

In [None]:
df = pd.read_csv('cars.csv')
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [None]:
df.shape

(8128, 5)

In [None]:
import numpy as np

np.random.seed(42)
missing_km_indices = np.random.choice(df.index, size=int(0.05*len(df)), replace=False)
df.loc[missing_km_indices, 'km_driven'] = np.nan

# Introduce missing values in 'owner' column (1% missing values)
missing_owner_indices = np.random.choice(df.index, size=int(0.01*len(df)), replace=False)
df.loc[missing_owner_indices, 'owner'] = np.nan



In [None]:
df.isnull().sum()

brand              0
km_driven        406
fuel               0
owner             81
selling_price      0
dtype: int64

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
                                                      df.drop(columns=['selling_price']),
                                                      df['selling_price'],
                                                      test_size=0.2,
                                                      random_state=42
                                                    )

In [None]:
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
6518,Tata,2560.0,Petrol,First Owner
6144,Honda,80000.0,Petrol,Second Owner
6381,Hyundai,150000.0,Diesel,Fourth & Above Owner
438,Maruti,120000.0,Diesel,Second Owner
5939,Maruti,25000.0,Petrol,First Owner


In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6502 entries, 6518 to 7270
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   brand      6502 non-null   object 
 1   km_driven  6502 non-null   float64
 2   fuel       6502 non-null   object 
 3   owner      6442 non-null   object 
dtypes: float64(1), object(3)
memory usage: 254.0+ KB


In [None]:
# Plan of Attack

# Missing value imputation
# Encoding Categorical Variables
# Scaling
# Feature Selection
# Model building
# Prediction

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest,chi2

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

### Diff Approach

In [None]:
owner_pipeline = Pipeline(steps=[
        ('impute',SimpleImputer(strategy='most_frequent')),
        ('encode',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1))
])

In [None]:
km_pipeline = Pipeline(steps=[
        ('impute_km_driven',SimpleImputer()),
        ('scale',StandardScaler())
])

In [None]:
transformer = ColumnTransformer(transformers=[
    ('owner_pipe',owner_pipeline,['owner']),
    ('ohe', OneHotEncoder(sparse_output=False,handle_unknown='ignore'),['brand','fuel']),
    ('km_pipe',km_pipeline,['km_driven'])
])

In [None]:
model_pipe = Pipeline(steps=[
    ('transformer',transformer),
    ('model',RandomForestRegressor())
])

In [None]:
model_pipe

In [None]:
model_pipe.fit(X_train, y_train)

In [None]:
model_pipe.named_steps

{'transformer': ColumnTransformer(transformers=[('owner_pipe',
                                  Pipeline(steps=[('impute',
                                                   SimpleImputer(strategy='most_frequent')),
                                                  ('encode',
                                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                  unknown_value=-1))]),
                                  ['owner']),
                                 ('ohe',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  ['brand', 'fuel']),
                                 ('km_pipe',
                                  Pipeline(steps=[('impute_km_driven',
                                                   SimpleImputer()),
                                                  ('scale', Sta

In [None]:
model_pipe.predict(X_test)

array([160984.65083333, 377153.02939819, 595961.87161414, ...,
       790533.29333333, 262569.17502555, 121577.46333333])

In [None]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(model_pipe, X_train, y_train, cv=5, scoring='r2').mean()

0.8427707145814292

In [None]:
# gridsearchcv
params = {
    'model__max_depth':[1,2,3,4,5,None]
}

In [None]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(model_pipe, params, cv=5, scoring='r2')
grid.fit(X_train, y_train)

In [None]:
grid.best_score_

0.8415013571093167

In [None]:
grid.best_params_

{'model__max_depth': None}

---------

In [None]:
# export
import pickle
pickle.dump(model_pipe,open('pipe.pkl','wb'))

In [None]:
# imputation transformer
trf1 = ColumnTransformer([
    ('impute_km_driven',SimpleImputer(),[1]),
    ('impute_owner',SimpleImputer(strategy='most_frequent'),[3])
],remainder='passthrough')

In [None]:
# encoding categorical variables
trf2 = ColumnTransformer(
    [
        ("ordinal", OrdinalEncoder(), [3]),
        ("onehot", OneHotEncoder(sparse_output=False), [0,2])
    ],
    remainder='passthrough'
)

In [None]:
# Scaling
from sklearn.preprocessing import MinMaxScaler
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,37))
])

In [None]:
# Feature selection
trf4 = SelectKBest(score_func=chi2,k=10)

In [None]:
# train the model
trf5 = RandomForestRegressor()

In [None]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('imputer',trf1),
    ('encoder',trf2),
    ('scaler',trf3),
    ('fselector',trf4),
    ('model',trf5)
])

In [None]:
model_pipe.fit(X_train, y_train)

In [None]:
model_pipe.feature_names_in_

array(['brand', 'km_driven', 'fuel', 'owner'], dtype=object)

In [None]:
model_pipe.named_steps

{'transformer': ColumnTransformer(transformers=[('owner_pipe',
                                  Pipeline(steps=[('impute',
                                                   SimpleImputer(strategy='most_frequent')),
                                                  ('encode',
                                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                  unknown_value=-1))]),
                                  ['owner']),
                                 ('ohe',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  ['brand', 'fuel']),
                                 ('km_pipe',
                                  Pipeline(steps=[('impute_km_driven',
                                                   SimpleImputer()),
                                                  ('scale', Sta

In [None]:
model_pipe.predict(X_test)

array([159839.7405    , 365270.38493092, 605967.89821466, ...,
       684009.50380952, 250610.03185381, 123365.24301948])

### Cross Validation

In [None]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(model_pipe, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()

-103193023815.93517

### Hyperparameter Tuning

In [None]:
# gridsearchcv
params = {
    'model__max_depth':[1,2,3,4,5,None]
}

In [None]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 480, in predict
    Xt = transform.transform(Xt)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/compose/_column_transformer.py", line 800, in transfo

In [None]:
grid.best_score_

nan

In [None]:
grid.best_params_

{'model__max_depth': 1}

### Export the Pipeline

In [None]:
# export
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))