<a href="https://colab.research.google.com/github/appliedcode/mthree-c422/blob/mthree-c422-Likhitha/FE_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pandas numpy scikit-learn -q

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.exceptions import DataConversionWarning
from sklearn.preprocessing import StandardScaler
import warnings

# Silence only scikit-learn's DataConversionWarning to keep cell output readable;
# every other warning category is still shown.
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [3]:
# Remote CSV holding the raw Titanic passenger table.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Hold out 20% of the rows for validation. The split is stratified on the
# target column and uses a fixed seed so it is repeatable across runs.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["Survived"],
)

In [4]:
def drop_unused_columns(df):
    """Return a copy of ``df`` without the identifier-style columns.

    ``PassengerId``, ``Ticket`` and ``Cabin`` are removed when present;
    columns that are absent are silently skipped. The input frame is not
    modified.
    """
    unused = ("PassengerId", "Ticket", "Cabin")
    # Only ask pandas to drop what actually exists, so a missing column
    # never raises.
    present = [column for column in unused if column in df.columns]
    trimmed = df.drop(columns=present)
    return trimmed.copy()

def clean_data(df):
    """Drop unused columns and impute missing Age, Fare and Embarked values.

    ``Age`` and ``Fare`` gaps are filled with that column's median, and
    ``Embarked`` gaps with its most frequent value. All statistics are taken
    from the frame passed in. A new frame is returned.
    """
    cleaned = drop_unused_columns(df)

    # Numeric columns: fill with the column median.
    for numeric_col in ("Age", "Fare"):
        column_median = cleaned[numeric_col].median()
        cleaned[numeric_col] = cleaned[numeric_col].fillna(column_median)

    # Categorical column: fill with the first reported mode.
    most_common_port = cleaned["Embarked"].mode()[0]
    cleaned["Embarked"] = cleaned["Embarked"].fillna(most_common_port)

    return cleaned

In [6]:
def engineer_features(df):
    """Create new features and encode categorical variables.

    Works on a copy, so the caller's frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Name``, ``SibSp``, ``Parch``, ``Fare``,
        ``Age``, ``Sex`` and ``Embarked``.

    Returns
    -------
    pandas.DataFrame
        The input columns minus ``Name``, ``SibSp`` and ``Parch``, plus
        ``FamilySize``, ``IsAlone``, ``FareBin`` and ``AgeBin``, with
        ``Sex``, ``Embarked`` and the derived ``Title`` one-hot encoded
        (first level of each dropped).

    Notes
    -----
    The ``FareBin`` quartile edges and the set of one-hot columns are derived
    from the frame passed in, so two frames processed separately can end up
    with different bin edges or different dummy columns.
    """
    df = df.copy()

    # Extract the title: the text between the comma and the first period.
    # expand=False yields a Series rather than a one-column DataFrame.
    titles = df["Name"].str.extract(r",\s*([^\.]+)\.", expand=False)
    # Names such as "Rothes, the Countess. of ..." produce "the Countess";
    # remove the article so the value matches the "Countess" entry below.
    titles = titles.str.strip().str.replace(r"^the\s+", "", regex=True)

    rare_titles = ["Lady", "Countess", "Capt", "Col", "Don", "Dr", "Major",
                   "Rev", "Sir", "Jonkheer", "Dona"]
    title_map = {title: "Rare" for title in rare_titles}
    title_map.update({"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"})
    df["Title"] = titles.replace(title_map)

    # Family size (self included) and is-alone flag
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

    # Fare quartiles and fixed age bands, both as integer codes.
    df["FareBin"] = pd.qcut(df["Fare"], 4, labels=False)
    # include_lowest=True keeps Age == 0 in the first band; without it the
    # right-closed interval (0, 12] maps that value to NaN.
    df["AgeBin"] = pd.cut(df["Age"], bins=[0, 12, 20, 40, 60, 100],
                          labels=False, include_lowest=True)
    df = df.drop(columns=["Name", "SibSp", "Parch"])

    # One-hot encoding
    df = pd.get_dummies(df, columns=["Sex", "Embarked", "Title"], drop_first=True)

    return df


In [7]:
def validate_data(df):
    """Check a prepared frame for nulls and for the required columns.

    Returns a list of human-readable problem descriptions; the list is empty
    when the frame has no null cells and carries every required column.
    """
    required_columns = {"Survived", "Pclass", "Age", "Fare", "FamilySize",
                        "IsAlone", "FareBin", "AgeBin"}
    problems = []

    # A single null cell anywhere in the frame is enough to flag it.
    if df.isna().to_numpy().any():
        problems.append("Missing values detected.")

    # Report the required columns that the frame does not carry.
    absent = required_columns - set(df.columns)
    if len(absent) > 0:
        problems.append(f"Missing columns: {absent}")

    return problems

In [8]:
def prepare_dataset(df):
    """Run the full preparation pipeline on one frame.

    The frame is cleaned, features are engineered from the cleaned result,
    and the engineered frame is validated.

    Returns a ``(features, validation_errors)`` tuple.
    """
    features = engineer_features(clean_data(df))
    return features, validate_data(features)


In [9]:
# Training set: run the pipeline and report any validation problems.
train_prepared, train_errors = prepare_dataset(train_df)
print("Train validation errors:", train_errors if train_errors else "None")

# Validation set: same pipeline, applied to the held-out rows.
val_prepared, val_errors = prepare_dataset(val_df)
print("Validation validation errors:", val_errors if val_errors else "None")

# Save both prepared splits as CSV files, leaving out the index column.
train_prepared.to_csv(
    "titanic_train_prepared.csv",
    index=False,
)
val_prepared.to_csv(
    "titanic_val_prepared.csv",
    index=False,
)

Train validation errors: None
Validation validation errors: None
