<a href="https://colab.research.google.com/github/aakashkumarme/ML-LEARN/blob/main/7_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Seed NumPy's global random generator so a full top-to-bottom re-run repeats.
np.random.seed(42)

# Load the car-sales CSV (the "missing-data" variant) and display the frame.
# NOTE(review): the path sits under /content/drive/MyDrive, so this presumably
# needs Google Drive mounted in Colab first — confirm.
data = pd.read_csv(
    "/content/drive/MyDrive/dataset/car-sales-extended-missing-data.csv"
)
data


Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [None]:
# Show each column's dtype to tell text (object) columns from numeric ones.
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [None]:
# Count the missing values in every column.
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

# Steps to do:


1.   Fill missing data
2.   Convert data to numbers
3.   Build a model on the data




In [None]:
#Getting data ready
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

#Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


# A row without a Price has no target to learn from, so remove it up front;
# missing feature values are left for the preprocessing pipeline to fill.
data.dropna(subset=["Price"], inplace=True)
print(data.shape[0])

950


In [None]:
# Column groups and the small pipeline that prepares each of them.

# Text columns: replace gaps with the literal "missing", then one-hot encode.
categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Door count: replace gaps with the constant 4.
door_feature = ["Doors"]
door_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="constant", fill_value=4))]
)

# Odometer reading: replace gaps with the column mean.
# The step name "imputer" is referenced by the grid-search parameter keys.
numeric_feature = ["Odometer (KM)"]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean"))]
)
################################################################################

# Preprocessing stage: send every column group through its own transformer
# (fill missing values, turn text into numbers) and join the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("door", door_transformer, door_feature),
        ("num", numeric_transformer, numeric_feature),
    ]
)

################################################################################

# Full estimator: preprocessing followed by a random-forest regressor.
# The step names form the prefixes of the grid-search parameter keys.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor()),
    ]
)

################################################################################

# Separate the features from the Price target and hold out 20% for testing.
X = data.drop(columns="Price")
y = data["Price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit preprocessing and forest together on the training split; the final
# expression displays the pipeline's score on the held-out split.
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.3008111853212634

In [None]:
# Tune the pipeline with an exhaustive grid search (5-fold cross-validation).
from sklearn.model_selection import GridSearchCV

# Keys follow "<pipeline step>__<nested step>__<parameter>".
# Two settings that used to be listed here are no longer accepted by
# RandomForestRegressor and made the search fail on current scikit-learn:
#   * min_impurity_split was removed from the estimator, so it is dropped
#     (the only value searched was None, i.e. "not set").
#   * max_features="auto" was removed; for regressors it meant "use all
#     features", which is spelled 1.0 and is valid on old and new versions.
pipe_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "model__n_estimators": [100, 1000],
    "model__max_depth": [None, 5],
    "model__max_features": [1.0],
}

# 2 x 2 x 2 x 1 = 8 candidates, each fitted on 5 folds.
gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2)
gs_model.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] model__max_depth=None, model__max_features=auto, model__min_impurity_split=None, model__n_estimators=100, preprocessor__num__imputer__strategy=mean 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  model__max_depth=None, model__max_features=auto, model__min_impurity_split=None, model__n_estimators=100, preprocessor__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_impurity_split=None, model__n_estimators=100, preprocessor__num__imputer__strategy=mean 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  model__max_depth=None, model__max_features=auto, model__min_impurity_split=None, model__n_estimators=100, preprocessor__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_impurity_split=None, model__n_estimators=100, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_impurity_split=None, model__n_estimators=100, preprocessor__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_impurity_split=None, model__n_estimators=100, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=None, model__max_features=auto, model__min_impurity_split=None, model__n_estimators=100, preprocessor__num__imputer__strategy=mean, total=   0.3s
[CV] model__max_depth=None, model__max_features=auto, model__min_impurity_split=None, model__n_estimators=100, preprocessor__num__imputer__strategy=mean 
[CV]  model__max_depth=None, mo

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   47.2s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('cat',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                                    