<a href="https://colab.research.google.com/github/adithyaprabhu007/math-coding-notes/blob/main/pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 # -------------------------
# 1. IMPORTS
# -------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# -------------------------
# 2. LOAD DATA
# -------------------------
# Read the Titanic CSV and, in the same expression, discard the columns that
# are not fed to the model.
df = pd.read_csv('/content/Titanic-Dataset.csv').drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin']
)

# -------------------------
# 3. SPLIT DATA
# -------------------------
# 'Survived' is the target; every other remaining column is a feature.
y = df['Survived']
X = df.drop(columns='Survived')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# -------------------------
# 4. BUILD SEPARATE TRANSFORMERS
# -------------------------
from sklearn.preprocessing import MinMaxScaler


def _numeric_pipeline():
    """Return a new pipeline: mean imputation followed by min-max scaling."""
    steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler()),
    ]
    return Pipeline(steps)


def _categorical_pipeline():
    """Return a new pipeline: most-frequent imputation, then one-hot encoding.

    The encoder emits a dense array and drops the first category of the
    column it encodes.
    """
    steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(sparse_output=False, drop='first')),
    ]
    return Pipeline(steps)


# Each column gets its own pipeline instance so the fitted state of one
# column's pipeline is never shared with another's.
age_pipeline = _numeric_pipeline()
fare_pipeline = _numeric_pipeline()
sex_pipeline = _categorical_pipeline()
embarked_pipeline = _categorical_pipeline()

# -------------------------
# 5. MERGE INTO COLUMNTRANSFORMER
# -------------------------
# Each entry is (name, pipeline, columns it is applied to).
column_pipelines = [
    ('age', age_pipeline, ['Age']),
    ('fare', fare_pipeline, ['Fare']),
    ('sex', sex_pipeline, ['Sex']),
    ('embarked', embarked_pipeline, ['Embarked']),
]

# Columns not listed above (Pclass, SibSp, Parch) are kept as-is via
# remainder='passthrough'.
preprocessor = ColumnTransformer(
    transformers=column_pipelines,
    remainder='passthrough',
)

# -------------------------
# 6. FULL PIPELINE
# -------------------------
# Keep the 5 features with the highest chi-squared score against the target.
feature_selector = SelectKBest(score_func=chi2, k=5)

# Seeded so that the fitted tree is the same on every run.
classifier = DecisionTreeClassifier(random_state=42)

# Preprocess -> select features -> classify, as one estimator.
pipe = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('select', feature_selector),
    ('clf', classifier),
])

# -------------------------
# 7. FIT & PREDICT
# -------------------------
# fit() returns the fitted pipeline itself, so predict() is chained directly:
# train on the training split, then label the held-out split.
y_pred = pipe.fit(x_train, y_train).predict(x_test)

# -------------------------
# 8. EVALUATE
# -------------------------
# Compute all metrics on the held-out split first, then report them.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("✅ Accuracy:", accuracy)
print("\n✅ Confusion Matrix:\n", conf_matrix)
print("\n✅ Classification Report:\n", report)


✅ Accuracy: 0.7877094972067039

✅ Confusion Matrix:
 [[91 14]
 [24 50]]

✅ Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.87      0.83       105
           1       0.78      0.68      0.72        74

    accuracy                           0.79       179
   macro avg       0.79      0.77      0.78       179
weighted avg       0.79      0.79      0.78       179



In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): the pipeline cell reads its CSV from /content directly rather
# than from under /content/drive, so nothing visible here depends on this
# mount -- confirm whether it is still needed.
from google.colab import drive
drive.mount('/content/drive')