In [1]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier

In [11]:
import numpy as np
import pandas as pd
import seaborn as sns 


In [12]:
df = sns.load_dataset('titanic')

In [13]:
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [15]:
## Step -> 1 p 1 -> train/test/split
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['survived']),
                                                 df['survived'],
                                                 test_size=0.2,
                                                random_state=42)

In [16]:
X_train.head(3)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
331,1,male,45.5,0,0,28.5,S,First,man,True,C,Southampton,no,True
733,2,male,23.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
382,3,male,32.0,0,0,7.925,S,Third,man,True,,Southampton,no,True


In [17]:
trf1 = ColumnTransformer([
  ('impute_age', SimpleImputer(),[2]),
  ('impute_embarked', SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

In [18]:
trf2 = ColumnTransformer([

  ('ohe_sex_embarked', OneHotEncoder(handle_unknown='ignore'),[1,6])

],remainder='passthrough')

In [30]:
## Scaling 
trf3 = ColumnTransformer([
  ('scale', MinMaxScaler(),slice(0,10))
  
])

In [31]:
## Feature Selection 
trf4 = SelectKBest(score_func=chi2,k=8)

In [32]:
trf5 = DecisionTreeClassifier()

### Create Pipeline 

In [33]:
pipe = Pipeline([
  ('trf1', trf1),
  ('trf2', trf2),
  ('trf3', trf3),
  ('trf4', trf4),
  ('trf5', trf5),
])

### Pipeline Vs make_pipeline
Pipeline requires naming of steps, make_pipeline does not.

(Same applies to ColumnTransformer vs make_column_transformer)

In [34]:
##  Alternate Syntax 
pipe2 = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [35]:
pipe2.fit(X_train,X_test)


ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.

In [26]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore'),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x0000025CA73C0F40>),
 'trf5': DecisionTreeClassifier()}

### Cross Validation Using Pipeline

In [27]:
## Cross Validation using cross_val_Score
from sklearn.model_selection import cross_val_score


In [29]:
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "d:\machine learning\ml-campus\.venv\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1136, in _hstack
    check_array(X, accept_sparse=True, ensure_all_finite=False)
    ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\machine learning\ml-campus\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "d:\machine learning\ml-campus\.venv\Lib\site-packages\sklearn\utils\_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
ValueError: could not convert string to float: 'male'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "d:\machine learning\ml-campus\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\machine learning\ml-campus\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\machine learning\ml-campus\.venv\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "d:\machine learning\ml-campus\.venv\Lib\site-packages\sklearn\pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ~~~~~~~~~~~~~~~~~~~~~~~~^
        cloned_transformer,
        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
        params=step_params,
        ^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "d:\machine learning\ml-campus\.venv\Lib\site-packages\joblib\memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "d:\machine learning\ml-campus\.venv\Lib\site-packages\sklearn\pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "d:\machine learning\ml-campus\.venv\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "d:\machine learning\ml-campus\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\machine learning\ml-campus\.venv\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1031, in fit_transform
    return self._hstack(list(Xs), n_samples=n_samples)
           ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\machine learning\ml-campus\.venv\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1140, in _hstack
    raise ValueError(
    ...<2 lines>...
    ) from e
ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.
