In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [12]:
# 1. Load the Titanic dataset and keep only the four columns used in this demo
titanic_df = sns.load_dataset('titanic').loc[:, ['age', 'embarked', 'deck', 'survived']]

# Preview the first rows, then report how many entries are missing in each column
print("--- Original Dataframe with Missing Values ---", titanic_df.head(), sep="\n")
print("\nMissing values before imputation:", titanic_df.isna().sum(), sep="\n")



--- Original Dataframe with Missing Values ---
    age embarked deck  survived
0  22.0        S  NaN         0
1  38.0        C    C         1
2  26.0        S  NaN         1
3  35.0        S    C         1
4  35.0        S  NaN         0

Missing values before imputation:
age         177
embarked      2
deck        688
survived      0
dtype: int64


In [13]:
# 2. Build one SimpleImputer per fill strategy
# A ColumnTransformer is used afterwards to assign an imputer to each column.
#
#   mean           -> numerical data without significant outliers (column average)
#   median         -> numerical data with outliers (robust middle value)
#   most_frequent  -> categorical (string) or discrete numerical data (the mode)
mean_imputer, median_imputer, most_frequent_imputer = [
    SimpleImputer(strategy=strategy_name)
    for strategy_name in ('mean', 'median', 'most_frequent')
]

# constant -> fill every gap with one fixed value (e.g., 0, 'Unknown') given by 'fill_value'
constant_imputer = SimpleImputer(fill_value='Unknown', strategy='constant')



In [14]:
# 3. Route each column to its imputer with a ColumnTransformer
# Every entry is (step name, imputer, list of columns the imputer is applied to).
preprocessor = ColumnTransformer(
    [
        ('age_imputer', mean_imputer, ['age']),                     # gaps -> column mean
        ('embarked_imputer', most_frequent_imputer, ['embarked']),  # gaps -> most common value
        ('deck_imputer', constant_imputer, ['deck']),               # gaps -> 'Unknown'
    ],
    # Columns not listed above ('survived' in titanic_df) are kept untouched
    remainder='passthrough',
)




In [15]:
# 4. Wrap the preprocessor in a Pipeline
# There is only a single step for now; the Pipeline wrapper is kept as a best practice
# so further steps can be chained onto the same object later.
imputation_pipeline = Pipeline([('preprocessor', preprocessor)])



In [16]:
# Apply the pipeline to the dataframe.
# fit_transform both fits the pipeline on titanic_df and returns the imputed data
# in a single call; the result is stored as an array (no column labels), which is
# why a later cell rebuilds a DataFrame from it.
titanic_imputed_array = imputation_pipeline.fit_transform(titanic_df)



In [17]:
# 5. Convert the result back to a DataFrame
# The pipeline returns a bare NumPy array whose columns follow the ColumnTransformer's
# output order (transformed columns first, then the passthrough remainder). The names
# are read from the fitted transformer rather than typed by hand, so they stay correct
# if the transformer list is edited.
fitted_preprocessor = imputation_pipeline.named_steps['preprocessor']

# By default the names come back as '<transformer name>__<column>'; keep the column part.
# (If prefixes are disabled, split() finds no '__' and the name is returned unchanged.)
new_columns = [
    feature_name.split('__', 1)[-1]
    for feature_name in fitted_preprocessor.get_feature_names_out()
]

# The array mixes numbers and strings, so a DataFrame built from it holds every column
# as generic Python objects. infer_objects() restores numeric dtypes where it can
# (e.g. 'age', 'survived') and leaves the string columns as they are.
titanic_imputed_df = pd.DataFrame(titanic_imputed_array, columns=new_columns).infer_objects()

print("\n--- DataFrame After Imputation ---")
print(titanic_imputed_df.head())
print("\nMissing values after imputation:")
print(titanic_imputed_df.isnull().sum())


--- DataFrame After Imputation ---
    age embarked     deck survived
0  22.0        S  Unknown        0
1  38.0        C        C        1
2  26.0        S  Unknown        1
3  35.0        S        C        1
4  35.0        S  Unknown        0

Missing values after imputation:
age         0
embarked    0
deck        0
survived    0
dtype: int64
