In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Read the Titanic CSV into a DataFrame; the relative path is resolved against
# the notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="../datasets/titanic.csv")

In [None]:
# Drop the columns that the modelling cells below do not use as features.
# Re-assigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to re-run: a second execution no longer
# raises KeyError for columns that have already been removed.
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore")

# Preview ten random rows; the fixed seed makes the preview identical on every run.
df.sample(10, random_state=42)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.drop(columns=["Survived"]),  # features: every column except the target
    df["Survived"],                 # target labels
    test_size=0.2,
    random_state=42,
)

In [None]:
# Count the missing entries in each column.
df.isna().sum()

In [None]:
# Preprocessing + model pipeline.
# Pipeline and ColumnTransformer come from the notebook's import cell, so this
# cell no longer carries its own (partly duplicated) imports.

# Columns routed to the numeric branch and to the categorical branch.
num_columns = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
class_columns = ["Sex", "Embarked"]

# Numeric branch: fill missing values with the column mean, then rescale with
# MinMaxScaler (default feature range).
num_pipeline = Pipeline([
    ("impute_num", SimpleImputer(strategy="mean")),
    ("Scaler", MinMaxScaler())
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode into a dense array. handle_unknown="ignore" means a category
# not seen during fit does not raise at transform time.
class_pipeline = Pipeline([
    ("impute_class", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

# Apply each branch to its own column list.
preprocessor = ColumnTransformer([
    ("nums", num_pipeline, num_columns),
    ("classes", class_pipeline, class_columns)
])

# Full model: preprocessing followed by a logistic-regression classifier, so the
# imputers/scaler/encoder are fitted only on whatever data is passed to fit().
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression())
])


In [None]:
# Fit the whole pipeline (preprocessing + classifier) on the training split and
# predict labels for the held-out rows. fit() returns the fitted pipeline
# itself, so the two calls can be chained.
Y_pred = pipeline.fit(X_train, Y_train).predict(X_test)

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
# accuracy_score is already imported in the notebook's first cell, so it is not
# re-imported here.
accuracy = accuracy_score(Y_test, Y_pred)
accuracy