<a href="https://colab.research.google.com/github/aninmath/machine_learning/blob/main/Titanic_dataset_with_sklearn_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the 'titanic' dataset through seaborn's load_dataset helper.
df = sns.load_dataset('titanic')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Target: the 'survived' column.
y = df['survived']

# Features: every column except 'survived' and 'alive'.
# NOTE(review): 'alive' is presumably dropped because it duplicates the label — confirm.
x = df.drop(columns=['survived', 'alive'])

In [None]:
# Frequency of each distinct value in the 'parch' column.
df['parch'].value_counts()

In [None]:
# Number of missing values in each feature column.
x.isna().sum()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
# Column groups that decide which preprocessing branch each feature goes through.

# Treated as categorical features.
categorical_cols = [
    'sex',
    'pclass',
    'sibsp',
    'parch',
    'embarked',
    'class',
    'who',
    'adult_male',
    'embark_town',
    'alone',
    'deck',
]
# Treated as numerical features.
numerical_cols = ['age', 'fare']

In [None]:
# Look at a single training row to see the feature columns.
x_train.head(1)

In [None]:
# Number of missing values in each column of the training features.
x_train.isna().sum()

In [None]:

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: fill gaps with the column mean, then apply MinMaxScaler.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols),
    ('num', numerical_transformer, numerical_cols),
])

# End-to-end estimator: preprocessing followed by logistic regression.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])


In [None]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(x_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out split.
pipeline.score(x_test,y_test)

In [None]:
# Show one training row again as a reminder of the available columns.
x_train.head(1)

## **Say I want to use a different imputation strategy for different columns**

In [None]:

# Column groups keyed by imputation strategy.
# NOTE(review): neither name is referenced by the later cells visible in this
# notebook, which pass numerical_cols / categorical_cols to the ColumnTransformer
# instead — confirm whether these were meant to be used there.
mean_impute_cols = ['age']
mode_impute_cols = categorical_cols  # binds the same list object as categorical_cols, not a copy


In [None]:
# Show only the training columns that contain missing values, with their counts.
# The per-column null counts are computed once and then filtered through a
# callable indexer, instead of evaluating the same aggregation twice.
x_train.isna().sum().loc[lambda null_counts: null_counts > 0]

In [None]:

# Mean branch: fill missing values with the column mean, then apply MinMaxScaler.
mean_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# Mode branch: fill missing values with the most frequent value, then one-hot encode.
mode_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])



In [None]:

# Send the numerical columns through the mean branch and the categorical columns
# through the mode branch. This rebinds the name `preprocessor`.
preprocessor = ColumnTransformer([
    ('mean_impute', mean_imputer, numerical_cols),
    ('mode_impute', mode_imputer, categorical_cols),
])

# Full estimator: column-wise preprocessing followed by logistic regression.
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])


In [None]:
# Print the plain-text representation of the assembled pipeline.
print(final_pipeline)

In [None]:

# Configure scikit-learn's estimator display, then show the pipeline.
from sklearn import set_config
set_config(display='diagram')  # request the diagram-style estimator display

final_pipeline  # last expression of the cell, so the notebook renders the pipeline object


In [None]:
# Fit the second pipeline (preprocessing + classifier) on the training split.
final_pipeline.fit(x_train, y_train)

In [None]:
# Score the fitted second pipeline on the held-out split.
final_pipeline.score(x_test,y_test)

In [None]:
# Accuracy computed explicitly from the held-out predictions.
from sklearn.metrics import accuracy_score

# Predict labels for the test rows, then compare them with the true labels.
y_pred = final_pipeline.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)