In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml

In [2]:
# Download (or read from the local scikit-learn cache) version 1 of the
# OpenML "titanic" dataset and keep the combined pandas frame.
data = fetch_openml(name="titanic", version=1, as_frame=True)
df = data["frame"]

In [3]:
# Discard the columns that none of the feature lists further down refer to.
df = df.drop(
    labels=["name", "ticket", "boat", "body", "home.dest", "cabin"],
    axis="columns",
)

In [4]:
# Keep only the rows whose target value ("survived") is present.
df = df[df["survived"].notna()]

In [5]:
# Target vector: the "survived" column cast to plain integers.
y = df["survived"].astype(int)
# Feature frame: every column except the target.
X = df.drop("survived", axis="columns")

In [6]:
# Columns routed through the numeric pipeline.
num_features = [
    "age",
    "fare",
]
# Columns routed through the categorical pipeline.
cat_features = [
    "sex",
    "embarked",
    "pclass",
]

In [7]:
# Numeric branch: fill missing values with the column mean, then standardize.
num_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [8]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" makes the encoder tolerate
# categories at transform time that were not seen during fit.
cat_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [9]:
# Route each column group through its own pipeline and concatenate the results.
# remainder="drop" is ColumnTransformer's default, spelled out here: any column
# of X that is not listed in num_features or cat_features is discarded.
preprocessor = ColumnTransformer(
    [
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ],
    remainder="drop",
)

In [10]:
X_processed = preprocessor.fit_transform(X)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42)

In [13]:
# Report completion and the dimensions of both splits.
print("Data preprocessing completed successfully.")
for label, matrix in (
    ("Training set shape:", X_train),
    ("Testing set shape:", X_test),
):
    print(label, matrix.shape)

Data preprocessing completed successfully.
Training set shape: (1047, 10)
Testing set shape: (262, 10)
