In [40]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [41]:
# Read the Titanic CSV (path is relative to the notebook's working directory).
titanic_csv = "../datasets/titanic.csv"
df = pd.read_csv(titanic_csv)

In [42]:
# Remove the columns that the model built later in this notebook does not use.
# Reassigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# have already been removed no longer raises a KeyError.
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors="ignore")

# Display a random sample of 10 rows as a quick sanity check.
df.sample(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
789,0,1,male,46.0,0,0,79.2,C
512,1,1,male,36.0,0,0,26.2875,S
26,0,3,male,,0,0,7.225,C
117,0,2,male,29.0,1,0,21.0,S
887,1,1,female,19.0,0,0,30.0,S
250,0,3,male,,0,0,7.25,S
200,0,3,male,28.0,0,0,9.5,S
671,0,1,male,31.0,1,0,52.0,S
549,1,2,male,8.0,1,1,36.75,S
812,0,2,male,35.0,0,0,10.5,S


In [43]:
# Separate the predictors from the "Survived" label, then hold out 20% of the
# rows for evaluation. The fixed random_state keeps the split reproducible.
features = df.drop(columns=["Survived"])
target = df["Survived"]
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [44]:
# Count the missing entries in each column.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [45]:
# Preprocessing and the classifier are assembled into one scikit-learn Pipeline
# so that fitting and predicting apply exactly the same transformations.
# Pipeline and ColumnTransformer come from the imports cell at the top of the
# notebook; they are intentionally not re-imported here.

# Columns routed to the numeric and the categorical branch, respectively.
num_columns = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
class_columns = ["Sex", "Embarked"]

# Numeric branch: fill missing values with the column mean, then min-max scale.
num_pipeline = Pipeline([
    ("impute_num", SimpleImputer(strategy="mean")),
    ("Scaler", MinMaxScaler())
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" prevents an error at predict
# time if a category appears that was not present when the encoder was fitted.
class_pipeline = Pipeline([
    ("impute_class", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

# Apply each branch to its own column subset and concatenate the results.
preprocessor = ColumnTransformer([
    ("nums", num_pipeline, num_columns),
    ("classes", class_pipeline, class_columns)
])

# Full model: preprocessing followed by a logistic-regression classifier
# (library default hyperparameters).
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression())
])



In [46]:
# Fit the whole pipeline on the training split, then predict labels for the
# held-out split. fit() returns the fitted pipeline, so the calls can be chained.
Y_pred = pipeline.fit(X_train, Y_train).predict(X_test)

In [48]:
# Share of held-out rows whose predicted label matches the true label.
# accuracy_score is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
accuracy = accuracy_score(Y_test, Y_pred)
accuracy

0.7932960893854749