<a href="https://colab.research.google.com/github/benasphy/Logistic-Regression/blob/main/With_Pipeline_vs_without.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# 1. 🚢 Load and prepare the Titanic dataset
data = sns.load_dataset('titanic')

# Keep selected columns and drop rows with missing values.
# The explicit .copy() makes df an independent frame, so the column
# assignments below cannot raise pandas' SettingWithCopyWarning.
df = data[['age', 'fare', 'sex', 'embarked', 'survived']].dropna().copy()

# Encode categorical features as integers.
# NOTE: 'embarked' is a nominal feature; the codes 0/1/2 are arbitrary, yet a
# linear model will treat them as ordered. Kept as-is so the scores below
# stay comparable with the recorded output.
df['sex'] = df['sex'].map({'male': 0, 'female': 1})
df['embarked'] = df['embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# Features and target
X = df.drop('survived', axis=1)
y = df['survived']

# 2. ✅ Version with Pipeline
# cross_val_score fits a fresh clone of the whole pipeline on each training
# split, so the scaler's mean/std are learned from that split only and the
# held-out fold never influences preprocessing.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression()),
])

scores_pipeline = cross_val_score(pipeline, X, y, cv=5)
print("✅ With pipeline (accuracy):", scores_pipeline.mean())

# 3. ❌ Version without Pipeline (manual steps)
# Manual scaling: the scaler is fit on ALL rows before cross-validation, so
# every held-out fold has already contributed to the scaling statistics
# (data leakage). This is the pattern the pipeline above avoids.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Model
model = LogisticRegression()
scores_manual = cross_val_score(model, X_scaled, y, cv=5)
print("❌ Without pipeline (accuracy):", scores_manual.mean())


✅ With pipeline (accuracy): 0.7752191470501331
❌ Without pipeline (accuracy): 0.7752191470501331


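✅ Both versions score the same on this dataset, but the pipeline is safer, cleaner, and preferred in real projects: it refits the scaler on each training fold during cross-validation, whereas scaling the full dataset up front lets statistics from the held-out fold leak into training.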