In this notebook, i will compare 2 xgb pipelines for the similarity of the result. Why? The reason is right now i have two of them where on trained on already processed data and only had the preproce and model in the pipeline while the second, was trained on raw data with custom pipelne. Why i compare them?: I just wanted to make sure that both of them is similar and i did not make a mistake in training. 

Note: it is important to note that 2 pipelines will not give the same result because they were trained separately.

In [1]:
from pathlib import Path

import joblib
import pandas as pd
import numpy as np
import json
import os, sys

In [2]:
PROJECT_ROOT = Path(r"C:\Users\User\all_project\projects_in_github\titanic_project")
sys.path.insert(0, str(PROJECT_ROOT))

In [3]:
BASE_DIR = Path.cwd().parent
ART_DIR = BASE_DIR / "artifacts" / "model_data"
FULL_MODEL_PATH = ART_DIR / "xgb_pipeline_raw.joblib"
MODEL_PATH = ART_DIR / "xgb_pipeline.joblib"
TRAIN_RAW_PATH = BASE_DIR / "data" / "raw" / "train_titanic.csv"
TRAIN_PROCESSED_PATH = BASE_DIR / "data" / "processed" / "titanic_processed.csv"

In [4]:
def load_model(path):
    return joblib.load(path)

def predict_survival_proba(dataset: pd.DataFrame, model) -> float:
    return model.predict_proba(dataset)[0,1]

def load_datasets(path):
    df= pd.read_csv(path)
    X=df.drop(columns=["Survived"], axis=1)
    y=df["Survived"].astype(int).values
    return X, y

def eval_at_threshold(model, X: pd.DataFrame, y: np.ndarray, thr: float) -> dict:
    proba = model.predict_proba(X)[:, 1]
    pred = (proba >= thr).astype(int)

    y = y.astype(int)

    tp = int(((y == 1) & (pred == 1)).sum())
    tn = int(((y == 0) & (pred == 0)).sum())
    fp = int(((y == 0) & (pred == 1)).sum())
    fn = int(((y == 1) & (pred == 0)).sum())

    acc = (tp + tn) / (tp + tn + fp + fn + 1e-12)
    precision = tp / (tp + fp + 1e-12)
    recall = tp / (tp + fn + 1e-12)
    f1 = 2 * precision * recall / (precision + recall + 1e-12)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [5]:
#loading threshold
with (ART_DIR / "threshold.json").open("r", encoding="utf-8") as f:
    threshold = json.load(f)['threshold_f1']

#loading models
full_model = load_model(FULL_MODEL_PATH)
model = load_model(MODEL_PATH)

#loading datasets
X_raw, y_raw = load_datasets(TRAIN_RAW_PATH)
X_proc, y_proc = load_datasets(TRAIN_PROCESSED_PATH)

#quick changes for raw dataset to match features
X_raw["Title"] = X_raw["Name"].str.split(",").str[1].str.split(".").str[0].str.strip()
X_raw.drop(columns=["Name"], inplace=True)

#analyzing performance
print("Performance on raw dataset with full custom pipeline:")
eval_raw=eval_at_threshold(full_model, X_raw, y_raw, threshold)
for k, v in eval_raw.items():
    print(f"  {k}: {v}")
print("\nPerformance on processed dataset with standard pipeline:")
eval_proc=eval_at_threshold(model, X_proc, y_proc, threshold)
for k, v in eval_proc.items():
    print(f"  {k}: {v}")
print("Differences (raw - processed):")
for k in eval_proc:
    diff = eval_raw[k] - eval_proc[k]
    print(f"  {k}: {diff}")

Performance on raw dataset with full custom pipeline:
  accuracy: 0.9057239057239047
  precision: 0.868571428571426
  recall: 0.8888888888888862
  f1: 0.8786127167625032

Performance on processed dataset with standard pipeline:
  accuracy: 0.8945005611672268
  precision: 0.8542857142857118
  recall: 0.8742690058479506
  f1: 0.86416184971048
Differences (raw - processed):
  accuracy: 0.01122334455667795
  precision: 0.014285714285714235
  recall: 0.014619883040935533
  f1: 0.014450867052023142
