In [85]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import MissingIndicator,SimpleImputer   

df=sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [86]:
df=df.drop(columns=['pclass','who','deck','embark_town','alive','sex','parch','sibsp','embarked','class','adult_male','alone'])


In [87]:
df.head()
X=df.drop(columns=['survived'])
y=df['survived']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [88]:
X_train.head()

Unnamed: 0,age,fare
331,45.5,28.5
733,23.0,13.0
382,32.0,7.925
704,26.0,7.8542
813,6.0,31.275


In [89]:
si=SimpleImputer()
X_train_trf = si.fit_transform(X_train)
X_test_trf = si.transform(X_test)

In [90]:
X_train_trf

array([[ 45.5   ,  28.5   ],
       [ 23.    ,  13.    ],
       [ 32.    ,   7.925 ],
       ...,
       [ 41.    ,  14.1083],
       [ 14.    , 120.    ],
       [ 21.    ,  77.2875]], shape=(712, 2))

In [91]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression()
clf.fit(X_train_trf,y_train)

y_pred=clf.predict(X_test_trf)
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6480446927374302

In [92]:
mi =MissingIndicator()
mi.fit(X_train)

X_train_missing=mi.transform(X_train)

In [93]:
X_train_missing

X_test_missing=mi.transform(X_test)


In [94]:
X_train['Age_na']=X_train_missing

X_test['Age_na'] = X_test_missing

In [95]:
X_test

Unnamed: 0,age,fare,Age_na
709,,15.2458,True
439,31.0,10.5000,False
840,20.0,7.9250,False
720,6.0,33.0000,False
39,14.0,11.2417,False
...,...,...,...
433,17.0,7.1250,False
773,,7.2250,True
25,38.0,31.3875,False
84,17.0,10.5000,False


In [96]:

si = SimpleImputer()

X_train_trf2 = si.fit_transform(X_train)
X_test_trf2 = si.transform(X_test)

In [97]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()

clf.fit(X_train_trf2,y_train)

y_pred = clf.predict(X_test_trf2)

from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6368715083798883

In [98]:

si = SimpleImputer(add_indicator=True)
X_train = si.fit_transform(X_train)

X_test = si.transform(X_test)

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()

clf.fit(X_train_trf2,y_train)

y_pred = clf.predict(X_test_trf2)

from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6368715083798883

Automatically Select Imputer

In [99]:

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
 


In [100]:
df=sns.load_dataset('titanic')

df=df.drop(columns=['who','deck','embark_town','alive','embarked','class','adult_male','alone'])
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [101]:
X=df.drop(columns=['survived'])
y=df['survived']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)


In [102]:
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare
331,1,male,45.5,0,0,28.5
733,2,male,23.0,0,0,13.0
382,3,male,32.0,0,0,7.925
704,3,male,26.0,1,0,7.8542
813,3,female,6.0,4,2,31.275


In [103]:
numerical_features = ['age','fare']
numerical_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler())
])

categorical_features =['embarked','sex']
categorical_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ohe',OneHotEncoder(handle_unknown='ignore'))
])

In [104]:
preprocessor=ColumnTransformer(
    transformers=[
        ('num',numerical_transformer,numerical_features),
        ('cat',categorical_transformer,categorical_features)
    ]
)

clf=Pipeline(steps=[('preprocessor',preprocessor),
                    ('classifier',LogisticRegression())
])

In [105]:
from sklearn import set_config
clf

In [106]:
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],  # Numerical imputer
    'preprocessor__cat__imputer__strategy': ['most_frequent', 'constant'],  # Categorical imputer
    'classifier__C': [0.1, 1.0, 10, 100]  # Logistic Regression hyperparameter
}
grid_search=GridSearchCV(clf,param_grid,cv=10)
grid_search.fit(X_train,y_train)

grid_search.best_params_

ValueError: 
All the 160 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
160 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ayush\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\core\indexes\base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
           ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'embarked'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\ayush\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_indexing.py", line 364, in _get_column_indices
    col_idx = all_columns.get_loc(col)
  File "c:\Users\ayush\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\core\indexes\base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 'embarked'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\ayush\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ayush\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\ayush\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "c:\Users\ayush\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ~~~~~~~~~~~~~~~~~~~~~~~~^
        cloned_transformer,
        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
        params=step_params,
        ^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\ayush\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "c:\Users\ayush\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "c:\Users\ayush\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "c:\Users\ayush\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\ayush\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\compose\_column_transformer.py", line 993, in fit_transform
    self._validate_column_callables(X)
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^
  File "c:\Users\ayush\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\compose\_column_transformer.py", line 552, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
                                         ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^
  File "c:\Users\ayush\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_indexing.py", line 372, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe
