In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import mlflow
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [None]:
# Use seaborn's "whitegrid" style for all subsequent plots.
sns.set_style("whitegrid")

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Point the MLflow client at the tracking server on this machine (port 5001).
# The loopback address is used instead of 0.0.0.0: 0.0.0.0 is a bind/listen
# wildcard rather than a destination address, and connecting to it fails on some
# platforms, whereas a server listening on all interfaces is reachable via loopback.
mlflow.set_tracking_uri("http://127.0.0.1:5001")
# Record every run from this notebook under the "titanic" experiment.
mlflow.set_experiment("titanic")
# Turn on MLflow autologging so model training below is tracked without
# explicit log calls.
mlflow.autolog()

In [None]:
# Titanic passenger CSV from the datasciencedojo/datasets GitHub repository.
# The URL tracks the `master` branch, so the file is not pinned to a commit.
DATASOURCE = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

In [None]:
# Read the CSV from DATASOURCE and lowercase every column label in one chained
# expression, so later cells can refer to columns as "age", "sex", "survived", ...
data = pd.read_csv(DATASOURCE).rename(columns=str.lower)

In [None]:
# Feature columns passed to the model.
# NOTE(review): `ismale` (created later by transform_sex) is not listed here, so the
# fitted model never sees it — confirm whether that omission is intentional.
X_cols = ["age", "sibsp", "parch"]
# Target column, kept as a one-element list: selecting with it yields a DataFrame,
# which is why later cells call `.values.ravel()` to obtain a 1-D label array.
y_col = ["survived"]

In [None]:
def transform_sex(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the categorical ``sex`` column with a binary ``ismale`` flag.

    The input frame is not modified; a new frame is returned. This keeps the
    function safe to call on slices of another frame (no SettingWithCopyWarning)
    and safe to call repeatedly on the same input.

    Args:
        df: Frame containing a ``sex`` column.

    Returns:
        A new frame without ``sex`` and with an ``int64`` column ``ismale`` that
        is 1 where ``sex == "male"`` and 0 otherwise (missing values map to 0).

    Raises:
        KeyError: If ``df`` has no ``sex`` column.
    """
    # Vectorized comparison; NaN compares unequal to "male" and therefore becomes 0.
    is_male = df["sex"].eq("male").astype("int64")
    # drop() and assign() both return new frames, leaving the caller's data intact.
    return df.drop(columns="sex").assign(ismale=is_male)

In [None]:
# Hold out 20% of the rows for evaluation. A fixed seed makes the partition, and
# therefore the downstream metrics, reproducible across kernel restarts.
RANDOM_STATE = 42
train, test = train_test_split(data, test_size=0.2, random_state=RANDOM_STATE)
# Detach both partitions from `data` so that assigning columns on them later does
# not raise pandas' SettingWithCopyWarning.
train, test = train.copy(), test.copy()

In [None]:
# Inspect column dtypes and non-null counts of the training split
# (shows which columns have missing values).
train.info()

In [None]:
# Median age computed on the training split only; it is reused below to fill
# missing ages in both the training and the test split.
median_age = train["age"].median()

In [None]:
# Fill missing ages with the training-set median. `assign` builds a new frame
# instead of writing a column into the split result, which avoids pandas'
# SettingWithCopyWarning and keeps this cell safe to re-run.
train = train.assign(age=train["age"].fillna(median_age))

In [None]:
# Swap the `sex` column of the training split for a 0/1 `ismale` column.
train = transform_sex(train)

In [None]:
# Logistic-regression classifier with scikit-learn's default hyperparameters.
lr = LogisticRegression()

In [None]:
# Fit on the selected feature columns. `train[y_col]` is a one-column DataFrame
# (y_col is a list), so `.values.ravel()` flattens it to the 1-D array sklearn expects.
lr.fit(train[X_cols], train[y_col].values.ravel())

In [None]:
# Ground-truth labels of the training split as a flat 1-D array.
y_train = train[y_col].to_numpy().ravel()
# In-sample predictions from the fitted model on the same rows.
y_train_pred = lr.predict(train[X_cols])

In [None]:
# Per-class precision / recall / F1 on the training data (in-sample, so optimistic).
print(classification_report(y_train, y_train_pred))

In [None]:
# Apply the same preprocessing to the held-out split. Missing ages are filled with
# `median_age`, which was computed from the training split, so no statistic from the
# test data is used. `assign` returns a new frame rather than writing into the split
# result, which avoids pandas' SettingWithCopyWarning.
test = test.assign(age=test["age"].fillna(median_age))
test = transform_sex(test)

In [None]:
# Ground-truth labels of the held-out split as a flat 1-D array.
y = test[y_col].to_numpy().ravel()
# Predictions from the fitted model on the held-out rows.
y_pred = lr.predict(test[X_cols])

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y, y_pred))

In [None]:
# Finish