In [1]:
!pip install pandas numpy scikit-learn -q

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.exceptions import DataConversionWarning
import warnings

# Silence scikit-learn's DataConversionWarning so cell output stays readable
warnings.filterwarnings("ignore", category=DataConversionWarning)

In [2]:
# Step 2: Load and Split the Dataset
# The raw Titanic CSV is fetched over HTTP on every run.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Hold out 20% of the rows for validation. Stratifying on the target keeps the
# survival rate the same in both splits; the fixed seed makes the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["Survived"],
)

In [3]:
# Step 3: Define the Pipeline Functions
def clean_data(df, reference=None):
    """Drop identifier-like columns and impute missing Age / Embarked values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; a new frame is returned.
    reference : pandas.DataFrame, optional
        Frame from which the imputation statistics (Age median, Embarked mode)
        are computed. Pass the training split when cleaning a validation or
        test split so that held-out rows are not imputed with their own
        statistics. Defaults to ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without PassengerId, Ticket and Cabin (when present),
        with missing Age filled by the median and missing Embarked filled by
        the most frequent value. If the source of statistics has no observed
        value for a column, that column's missing values are left as they are.
    """
    df = df.drop(columns=["PassengerId", "Ticket", "Cabin"], errors="ignore").copy()
    stats_source = df if reference is None else reference

    # Impute Age with the median (NaN median => fillna is a no-op).
    df["Age"] = df["Age"].fillna(stats_source["Age"].median())

    # Impute Embarked with the most frequent value. mode() is empty when every
    # value is missing; indexing it with [0] would raise an opaque KeyError.
    embarked_modes = stats_source["Embarked"].mode()
    if not embarked_modes.empty:
        df["Embarked"] = df["Embarked"].fillna(embarked_modes.iloc[0])
    return df

def _with_known_categories(values, known):
    """Return ``values`` as a Categorical whose categories start with ``known``.

    Fixing the category list makes ``pd.get_dummies`` emit the same indicator
    columns for every split, even when a split does not contain one of the
    known values. Values outside ``known`` are appended (sorted) so they still
    get their own column instead of silently becoming NaN.
    """
    unseen = sorted(set(values.dropna().unique()) - set(known), key=str)
    return pd.Categorical(values, categories=list(known) + unseen)


def engineer_features(df, fare_bins=None):
    """Derive model features from a cleaned Titanic frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Name, Sex, Age, SibSp, Parch, Fare, Embarked.
        It is not modified; a new frame is returned.
    fare_bins : sequence of float, optional
        Bin edges to use for ``FareBin`` (for example the quartile edges of
        the training split, so every split is binned identically). The
        outermost edges are widened to -inf / +inf so no fare falls outside.
        When omitted, the quartiles of ``df``'s own Fare column are used.

    Returns
    -------
    pandas.DataFrame
        Frame with Title, FamilySize, IsAlone, FareBin and AgeBin added,
        Name / SibSp / Parch removed, and Sex / Embarked / Title one-hot
        encoded with the first category dropped.
    """
    df = df.copy()

    # Title extraction: the text between the comma and the first period.
    # An optional leading "the " is skipped so "the Countess" yields "Countess".
    title = df["Name"].str.extract(r",\s*(?:the\s+)?([^.]+?)\s*\.", expand=False)
    # Fold French / alternative spellings into their common equivalents.
    title = title.replace({"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"})
    # Everything that is not a common title (including names with no title)
    # is grouped as "Rare", which keeps the set of Title values fixed.
    common_titles = ["Master", "Miss", "Mr", "Mrs"]
    df["Title"] = title.where(title.isin(common_titles), "Rare")

    # Family size & is alone
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

    # Fare bins: quartiles of this frame by default, caller-supplied edges otherwise.
    fare = df["Fare"].fillna(0)
    if fare_bins is None:
        # duplicates="drop" avoids a ValueError when quartile edges coincide.
        df["FareBin"] = pd.qcut(fare, 4, labels=False, duplicates="drop")
    else:
        inner_edges = list(fare_bins)[1:-1]
        edges = [float("-inf"), *inner_edges, float("inf")]
        df["FareBin"] = pd.cut(fare, bins=edges, labels=False)

    # Age bins are right-closed; include_lowest keeps Age == 0 in the first bin
    # instead of turning it into NaN.
    df["AgeBin"] = pd.cut(
        df["Age"], bins=[0, 12, 20, 40, 60, 100], labels=False, include_lowest=True
    )

    # Drop unused columns
    df = df.drop(columns=["Name", "SibSp", "Parch"])

    # One-hot encode with fixed category lists so the resulting columns do not
    # depend on which values happen to occur in this particular frame.
    df["Sex"] = _with_known_categories(df["Sex"], ["female", "male"])
    df["Embarked"] = _with_known_categories(df["Embarked"], ["C", "Q", "S"])
    df["Title"] = _with_known_categories(df["Title"], common_titles + ["Rare"])
    df = pd.get_dummies(df, columns=["Sex", "Embarked", "Title"], drop_first=True)
    return df

def validate_data(df):
    """Run basic sanity checks on a prepared frame.

    Two things are checked: that no column contains null values, and that the
    core feature columns are present. Returns a list of human-readable
    problem descriptions; the list is empty when both checks pass.
    """
    problems = []

    # Null check: report only the columns that actually contain nulls.
    nulls_per_column = df.isnull().sum()
    columns_with_nulls = nulls_per_column[nulls_per_column > 0]
    if not columns_with_nulls.empty:
        problems.append(f"Null values found:\n{columns_with_nulls}")

    # Schema check: every required column must exist.
    required = {"Survived","Pclass","Age","Fare","FamilySize","IsAlone","FareBin","AgeBin"}
    absent = required - set(df.columns)
    if absent:
        problems.append(f"Missing columns: {absent}")

    return problems

In [4]:
# Step 4: Execute the Pipeline
# Clean and feature-engineer training data
train_clean = clean_data(train_df)
train_feat  = engineer_features(train_clean)
train_errors = validate_data(train_feat)
print("Train validation errors:", train_errors or "None")

# Clean and feature-engineer validation data
val_clean = clean_data(val_df)
val_feat  = engineer_features(val_clean)
val_errors = validate_data(val_feat)
print("Validation validation errors:", val_errors or "None")

# Both splits are encoded independently, so a category that occurs in only one
# of them can produce indicator columns the other split lacks. Align the
# validation frame to the training schema (after validating it, so the checks
# above still see the un-padded frame): indicator columns absent from the
# validation split become all-zero, and columns never seen in training are dropped.
val_feat = val_feat.reindex(columns=train_feat.columns, fill_value=0)

Train validation errors: None
Validation validation errors: None


In [5]:
# Step 5: Save Prepared Data
# Write each prepared split to a CSV in the working directory, without the index.
for prepared, destination in (
    (train_feat, "titanic_train_prepared.csv"),
    (val_feat, "titanic_val_prepared.csv"),
):
    prepared.to_csv(destination, index=False)



 1. How did validation checks help catch data quality issues early?
Validation checks ensured that:
- No null values remained after cleaning and feature engineering.
- Key engineered columns (e.g., AgeBin, FareBin, IsAlone) were present.
- The dataset structure matched expectations before modeling.

This helped prevent runtime errors and inconsistent inputs later in the machine learning pipeline.

---

 2. Which engineered features contributed most to dataset richness?
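The following features are expected to add the most to the dataset's predictive power (no model is trained in this notebook, so this is an expectation rather than a measured result):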
- Title: Extracted from the passenger's name, this adds insight into social status, gender, and age.
- FamilySize and IsAlone: Captured social structure and travel patterns, which affected survival likelihood.
- FareBin and AgeBin: Transformed continuous variables into categorical bins, capturing non-linear effects more effectively.

---

 3. How would you extend this pipeline to include scaling, imputation strategies, or integrate with a model training step?
To extend the pipeline:
- Scaling: Apply StandardScaler or MinMaxScaler to numerical features (e.g., Age, Fare) using ColumnTransformer.
- Advanced Imputation: Use SimpleImputer or KNNImputer for more robust handling of missing values.
- Model Training Integration: Use sklearn.pipeline.Pipeline to combine feature engineering and model training into one consistent pipeline.
- Cross-validation & tuning: Wrap the pipeline in GridSearchCV or RandomizedSearchCV for hyperparameter optimization.

Together, these steps would automate the pipeline end to end and bring it much closer to production readiness.
