In [15]:
import pandas as pd
from pathlib import Path

CSV_PATH = "/Users/burakbozatli/Desktop/data/shippingdata.csv"
df = pd.read_csv(CSV_PATH)

print("Shape:", df.shape)
print("Columns:", list(df.columns))
print(df.head(3))

print("\nTarget (Reached.on.Time_Y.N) value counts:")
print(df["Reached.on.Time_Y.N"].value_counts(dropna=False))

print("\nDtypes:")
print(df.dtypes)

Shape: (10999, 12)
Columns: ['ID', 'Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases', 'Product_importance', 'Gender', 'Discount_offered', 'Weight_in_gms', 'Reached.on.Time_Y.N']
   ID Warehouse_block Mode_of_Shipment  Customer_care_calls  Customer_rating  \
0   1               D           Flight                    4                2   
1   2               F           Flight                    4                5   
2   3               A           Flight                    2                2   

   Cost_of_the_Product  Prior_purchases Product_importance Gender  \
0                  177                3                low      F   
1                  216                2                low      M   
2                  183                4                low      M   

   Discount_offered  Weight_in_gms  Reached.on.Time_Y.N  
0                44           1233                    1  
1                59           3088  

In [None]:
#Hedef kolon: Reached.on.Time_Y.N (1=on-time, 0=late). Sınıf dağılımı: 6563 / 4436 → çok dengesiz değil, ama 1’ler biraz fazla.
#Tipler: kategorikler (Warehouse_block, Mode_of_Shipment, Product_importance, Gender) object; sayısallar int. Beklenen gibi.

In [17]:
df = df.copy()
df["OnTime"] = df["Reached.on.Time_Y.N"]
df = df.drop(columns=["ID"], errors="ignore")

print(df.isna().sum().sort_values(ascending=False))
print(df["OnTime"].value_counts())

Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
Reached.on.Time_Y.N    0
OnTime                 0
dtype: int64
OnTime
1    6563
0    4436
Name: count, dtype: int64


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder

y = df["OnTime"]
X = df.drop(columns=["OnTime", "Reached.on.Time_Y.N"])

cat_cols = ["Warehouse_block", "Mode_of_Shipment", "Product_importance", "Gender"]
num_cols = [c for c in X.columns if c not in cat_cols]

print("Categorical:", cat_cols)
print("Numeric:", num_cols)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
print("Train:", X_train.shape, "Test:", X_test.shape,
      "| OnTime=1 (train):", int(y_train.sum()), "of", y_train.shape[0])

pre_lr = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", StandardScaler(), num_cols),
    ],
    remainder="drop",
)

pre_rf = ColumnTransformer(
    transformers=[
        ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), cat_cols),
        ("num", "passthrough", num_cols),
    ],
    remainder="drop",
)


Categorical: ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']
Numeric: ['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases', 'Discount_offered', 'Weight_in_gms']
Train: (8799, 10) Test: (2200, 10) | OnTime=1 (train): 5250 of 8799


In [21]:
import time
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)

TH_40 = 0.40

pipe_lr = Pipeline(steps=[
    ("prep", pre_lr),
    ("clf", LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42))
])

t0 = time.time()
pipe_lr.fit(X_train, y_train)
fit_s_lr = time.time() - t0

y_proba = pipe_lr.predict_proba(X_test)[:, 1]
y_pred  = (y_proba >= TH_40).astype(int)

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec  = recall_score(y_test, y_pred)
f1   = f1_score(y_test, y_pred)
roc  = roc_auc_score(y_test, y_proba)
cm   = confusion_matrix(y_test, y_pred)

print(f"[LR @th=0.40] acc={acc:.3f} prec={prec:.3f} rec={rec:.3f} f1={f1:.3f} auc={roc:.3f} | fit_s={fit_s_lr:.2f}s")
print("CM:\n", cm)
print("\nReport @0.40:\n", classification_report(y_test, y_pred, digits=3))

[LR @th=0.40] acc=0.639 prec=0.706 rec=0.676 f1=0.691 auc=0.717 | fit_s=0.03s
CM:
 [[518 369]
 [426 887]]

Report @0.40:
               precision    recall  f1-score   support

           0      0.549     0.584     0.566       887
           1      0.706     0.676     0.691      1313

    accuracy                          0.639      2200
   macro avg      0.627     0.630     0.628      2200
weighted avg      0.643     0.639     0.640      2200



In [None]:
#AUC=0.717 → ayırma gücü fena değil.
#th=0.40’ta P=0.706 / R=0.676 / F1=0.691: dengeli; ama FP=369, FN=426 (CM’den).
#ihtiyaca göre eşiği oynatmak mantıklı (FP↓ için th↑, FN↓ için th↓).

In [23]:
from sklearn.metrics import precision_recall_curve, confusion_matrix, classification_report
import numpy as np

prec_arr, rec_arr, th_arr = precision_recall_curve(y_test, y_proba)
f1s = 2 * (prec_arr * rec_arr) / (prec_arr + rec_arr + 1e-12)

best_idx = int(f1s.argmax())
best_th = th_arr[best_idx-1] if best_idx > 0 else 0.5
print(f"[LR] Best F1 threshold ≈ {best_th:.3f} | P={prec_arr[best_idx]:.3f} R={rec_arr[best_idx]:.3f} F1={f1s[best_idx]:.3f}")

y_pred_best = (y_proba >= best_th).astype(int)
print("CM @best_th:\n", confusion_matrix(y_test, y_pred_best))
print("Report @best_th:\n", classification_report(y_test, y_pred_best, digits=3))

target_recall = 0.80
idx = np.where(rec_arr >= target_recall)[0]
if len(idx):
    i = idx[0]
    th_recall = th_arr[i-1] if i > 0 else th_arr[0]
    y_pred_r = (y_proba >= th_recall).astype(int)
    print(f"[LR] th for recall≥{target_recall:.2f}: {th_recall:.3f} | P={prec_arr[i]:.3f} R={rec_arr[i]:.3f}")
    print(confusion_matrix(y_test, y_pred_r))
    print(classification_report(y_test, y_pred_r, digits=3))
else:
    print(f"[LR] recall≥{target_recall:.2f} sağlanamadı.")

[LR] Best F1 threshold ≈ 0.177 | P=0.598 R=0.998 F1=0.748
CM @best_th:
 [[   5  882]
 [   3 1310]]
Report @best_th:
               precision    recall  f1-score   support

           0      0.625     0.006     0.011       887
           1      0.598     0.998     0.748      1313

    accuracy                          0.598      2200
   macro avg      0.611     0.502     0.379      2200
weighted avg      0.609     0.598     0.451      2200

[LR] th for recall≥0.80: 0.141 | P=0.597 R=1.000
[[   0  887]
 [   0 1313]]
              precision    recall  f1-score   support

           0      0.000     0.000     0.000       887
           1      0.597     1.000     0.748      1313

    accuracy                          0.597      2200
   macro avg      0.298     0.500     0.374      2200
weighted avg      0.356     0.597     0.446      2200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
RUN_MLFLOW = True
try:
    import mlflow, mlflow.sklearn
    from mlflow.models.signature import infer_signature
    import matplotlib.pyplot as plt
    from pathlib import Path
except Exception:
    RUN_MLFLOW = False

if RUN_MLFLOW:
    ART_DIR = Path("artifacts"); ART_DIR.mkdir(exist_ok=True)
    mlflow.set_experiment("logistics-demo")
    with mlflow.start_run(run_name="lr-th-0p40"):
        mlflow.log_param("model", "LogisticRegression")
        mlflow.log_param("class_weight", "balanced")
        mlflow.log_param("threshold", 0.40)

        mlflow.log_metric("accuracy", float(acc))
        mlflow.log_metric("precision", float(prec))
        mlflow.log_metric("recall", float(rec))
        mlflow.log_metric("f1", float(f1))
        mlflow.log_metric("roc_auc", float(roc))

        fig, ax = plt.subplots()
        ax.imshow(cm); ax.set_title("CM LR @th=0.40")
        for (i, j), v in np.ndenumerate(cm):
            ax.text(j, i, int(v), ha="center", va="center")
        p = ART_DIR / "cm_lr_th040.png"
        fig.tight_layout(); fig.savefig(p, dpi=150); plt.close(fig)
        mlflow.log_artifact(str(p), artifact_path="plots")

        signature = infer_signature(X_train, pipe_lr.predict(X_train))
        mlflow.sklearn.log_model(pipe_lr, "model", signature=signature)

    print("[MLflow] Logged LR run: logistics-demo / lr-th-0p40")



[MLflow] Logged LR run: logistics-demo / lr-th-0p40


In [41]:
import time, numpy as np
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report, precision_recall_curve
)

try:
    pre_rf
except NameError:
    pre_rf = ColumnTransformer(
        transformers=[
            ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), cat_cols),
            ("num", "passthrough", num_cols),
        ],
        remainder="drop",
    )

RF_PARAMS = dict(
    n_estimators=1000, max_depth=14, max_features="sqrt",
    min_samples_leaf=2, bootstrap=True, max_samples=0.8,
    n_jobs=-1, random_state=42
)

rf = Pipeline(steps=[
    ("prep", pre_rf),
    ("rf", RandomForestClassifier(**RF_PARAMS))
])

t0 = time.time()
rf.fit(X_train, y_train)
fit_s = time.time() - t0
print(f"[INFO] RF retrain done in {fit_s:.2f}s")

proba_rf = rf.predict_proba(X_test)[:, 1]

def eval_at(th):
    pred = (proba_rf >= th).astype(int)
    return {
        "th": th,
        "acc": accuracy_score(y_test, pred),
        "prec": precision_score(y_test, pred),
        "rec":  recall_score(y_test, pred),
        "f1":   f1_score(y_test, pred),
        "auc":  roc_auc_score(y_test, proba_rf),
        "cm":   confusion_matrix(y_test, pred)
    }

res_040 = eval_at(0.40)
res_050 = eval_at(0.50)
print("\n[RF] @th=0.40:", {k:(float(v) if k!='cm' else v.tolist()) for k,v in res_040.items()})
print("[RF] @th=0.50:", {k:(float(v) if k!='cm' else v.tolist()) for k,v in res_050.items()})

best = {"th": None, "f1": -1}
for th in np.arange(0.40, 0.551, 0.01):
    pred = (proba_rf >= th).astype(int)
    f1v = f1_score(y_test, pred)
    if f1v > best["f1"]:
        best = {
            "th": float(th),
            "f1": float(f1v),
            "acc": float(accuracy_score(y_test, pred)),
            "prec": float(precision_score(y_test, pred)),
            "rec": float(recall_score(y_test, pred)),
            "cm": confusion_matrix(y_test, pred).tolist()
        }
print("\n[RF] Best F1 in 0.40–0.55:", best)

RUN_MLFLOW = True
try:
    import mlflow, mlflow.sklearn
    import matplotlib.pyplot as plt
    from mlflow.models.signature import infer_signature
except Exception:
    RUN_MLFLOW = False

if RUN_MLFLOW:
    ART_DIR = Path("artifacts"); ART_DIR.mkdir(exist_ok=True)

    cm040 = res_040["cm"]
    fig, ax = plt.subplots()
    ax.imshow(cm040); ax.set_title("CM RF @th=0.40")
    for (i, j), v in np.ndenumerate(cm040): ax.text(j, i, int(v), ha="center", va="center")
    p = ART_DIR / "cm_rf_th040.png"
    fig.tight_layout(); fig.savefig(p, dpi=150); plt.close(fig)

    mlflow.set_experiment("logistics-demo")
    with mlflow.start_run(run_name="rf-full-th040"):
        mlflow.log_params(RF_PARAMS)
        mlflow.log_param("threshold", 0.40)
        mlflow.log_metric("acc_at_040", float(res_040["acc"]))
        mlflow.log_metric("prec_at_040", float(res_040["prec"]))
        mlflow.log_metric("rec_at_040",  float(res_040["rec"]))
        mlflow.log_metric("f1_at_040",   float(res_040["f1"]))
        mlflow.log_metric("auc_at_040",  float(res_040["auc"]))
        mlflow.log_metric("best_f1_in_040_055", float(best["f1"]))
        mlflow.log_artifact(str(p), artifact_path="plots")

        signature = infer_signature(X_train, rf.predict(X_train))
        input_example = X_train.head(1)

        mlflow.sklearn.log_model(
            sk_model=rf,
            name="rf_model",
            signature=signature,
            input_example=input_example
        )
    print("[MLflow] RF model logged with signature & input_example")


[INFO] RF retrain done in 1.34s

[RF] @th=0.40: {'th': 0.4, 'acc': 0.6213636363636363, 'prec': 0.636830102622577, 'rec': 0.8507235338918507, 'f1': 0.7283990870557548, 'auc': 0.7372549760396212, 'cm': [[250, 637], [196, 1117]]}
[RF] @th=0.50: {'th': 0.5, 'acc': 0.6654545454545454, 'prec': 0.8312284730195177, 'rec': 0.5514089870525514, 'f1': 0.663003663003663, 'auc': 0.7372549760396212, 'cm': [[740, 147], [589, 724]]}

[RF] Best F1 in 0.40–0.55: {'th': 0.4, 'f1': 0.7283990870557548, 'acc': 0.6213636363636363, 'prec': 0.636830102622577, 'rec': 0.8507235338918507, 'cm': [[250, 637], [196, 1117]]}




[MLflow] RF model logged with signature & input_example


In [None]:
#th=0.40: acc=0.621, prec=0.637, rec=0.851, F1=0.728, AUC=0.737
#→ recall güçlü (gecikmeleri daha çok yakalıyor), F1 de LR’dan yüksek (LR F1≈0.691).
#th=0.50: acc=0.666, prec=0.831, rec=0.551, F1=0.663
#→ precision güçlü, ama recall düşüyor.

In [43]:
from sklearn.inspection import permutation_importance
import pandas as pd
import numpy as np

r = permutation_importance(
    rf, X_test, y_test,
    scoring="roc_auc", n_repeats=5, random_state=42, n_jobs=-1
)

feat_names = rf.named_steps["prep"].get_feature_names_out()
imp = pd.Series(r.importances_mean, index=feat_names).sort_values(ascending=False)

imp.index = [s.replace("cat__", "").replace("num__", "") for s in imp.index]

print("Top-15 permutation importance (roc_auc):")
print(imp.head(15).round(6))

Top-15 permutation importance (roc_auc):
Weight_in_gms          0.083637
Discount_offered       0.026240
Cost_of_the_Product    0.002189
Mode_of_Shipment       0.000986
Warehouse_block       -0.000744
Gender                -0.001687
Customer_rating       -0.002084
Customer_care_calls   -0.003019
Prior_purchases       -0.004328
Product_importance    -0.008837
dtype: float64


In [None]:
#Weight_in_gms açık ara en faydalı sinyal.
#Discount_offered ikinci; indirim arttıkça gecikme/“on-time” olasılığı nasıl değişiyor, kontrol etmeye değer.
#Diğerlerinin negatif olması “kötü etkiliyor” demek değil;
#permütasyon gürültüsü ve etkileşim/kollinearite yüzünden ortalama etki sıfır civarında 
#model onlar olmadan da benzer performansı yakalıyor olabilir.

In [45]:
import numpy as np
import pandas as pd

X_te = X_test.copy()
X_te["proba_rf"] = proba_rf

def binned_table(col, q=10):
    bins = np.unique(np.quantile(X_te[col].astype(float), np.linspace(0,1,q+1)))
    labels = [f"[{bins[i]:.0f}, {bins[i+1]:.0f})" for i in range(len(bins)-1)]
    cut = pd.cut(X_te[col].astype(float), bins=bins, include_lowest=True, labels=labels)
    tbl = (
        X_te.groupby(cut, observed=True)
            .agg(n=("proba_rf","size"),
                 mean_proba=("proba_rf","mean"),
                 ontime_rate=("proba_rf", lambda s: np.mean((s>=0.5).astype(int))))
            .reset_index()
            .rename(columns={col:"bin"})
    )
    return tbl

tbl_w = binned_table("Weight_in_gms", q=10)
tbl_d = binned_table("Discount_offered", q=10)

print("\n[Weight_in_gms] bin -> n, mean_proba, ontime_rate@0.5")
print(tbl_w.to_string(index=False))

print("\n[Discount_offered] bin -> n, mean_proba, ontime_rate@0.5")
print(tbl_d.to_string(index=False))



[Weight_in_gms] bin -> n, mean_proba, ontime_rate@0.5
         bin   n  mean_proba  ontime_rate
[1002, 1329) 220    0.688896     0.568182
[1329, 1664) 220    0.671765     0.513636
[1664, 2029) 220    0.669726     0.581818
[2029, 3364) 220    0.984323     1.000000
[3364, 4201) 220    0.730377     0.672727
[4201, 4531) 221    0.430454     0.108597
[4531, 4889) 219    0.425446     0.123288
[4889, 5265) 220    0.432068     0.113636
[5265, 5624) 220    0.429576     0.122727
[5624, 7588) 220    0.443025     0.154545

[Discount_offered] bin -> n, mean_proba, ontime_rate@0.5
     bin   n  mean_proba  ontime_rate
  [1, 2) 331    0.482168     0.265861
  [2, 3) 184    0.465972     0.250000
  [3, 4) 161    0.439662     0.136646
  [4, 6) 334    0.476995     0.224551
  [6, 7) 186    0.456708     0.166667
  [7, 8) 142    0.448488     0.183099
 [8, 10) 341    0.457008     0.181818
[10, 19)  88    0.994867     1.000000
[19, 45) 222    0.997397     1.000000
[45, 65) 211    0.997002     1.000000


In [None]:
#Weight_in_gms: 4.2–5.6kg civarında mean_proba ≈ 0.43 ve on-time oranı çok düşük → bu aralık gecikme riski yüksek.
#Buna karşılık 2.0–3.36kg aralığında mean_proba ~0.98 ve on-time %100 🤨
#Discount_offered: %10+ indirimlerde mean_proba ≈ 0.995–0.997 ve on-time %100.
#Bu da “indirim yüksekse hep zamanında” gibi basamaklı bir davranış gösteriyor.

In [47]:
import pandas as pd
import numpy as np

X_te = X_test.copy()
X_te["OnTime"] = y_test.values

X_te["high_disc"] = (X_te["Discount_offered"] >= 10).astype(int)
print("OnTime rate by high_disc:\n", X_te.groupby("high_disc")["OnTime"].mean())

bins_w = [X_te["Weight_in_gms"].min()-1, 2029, 3364, 4201, 5624, X_te["Weight_in_gms"].max()+1]
labels_w = ["<2029", "2029-3364", "3364-4201", "4201-5624", ">=5624"]
X_te["w_bucket"] = pd.cut(X_te["Weight_in_gms"], bins=bins_w, labels=labels_w, include_lowest=True)
print("\nOnTime rate by weight bucket:\n", X_te.groupby("w_bucket")["OnTime"].mean())

print("\nOnTime by high_disc x Mode_of_Shipment:")
print(pd.crosstab(X_te["high_disc"], X_te["Mode_of_Shipment"], values=X_te["OnTime"], aggfunc="mean"))

print("\nCounts by high_disc x Mode_of_Shipment:")
print(pd.crosstab(X_te["high_disc"], X_te["Mode_of_Shipment"]))

print("\nOnTime by weight bucket x Mode_of_Shipment:")
print(pd.crosstab(X_te["w_bucket"], X_te["Mode_of_Shipment"], values=X_te["OnTime"], aggfunc="mean"))

OnTime rate by high_disc:
 high_disc
0    0.470981
1    0.865906
Name: OnTime, dtype: float64

OnTime rate by weight bucket:
 w_bucket
<2029        0.706061
2029-3364    1.000000
3364-4201    0.763636
4201-5624    0.419318
>=5624       0.409091
Name: OnTime, dtype: float64

OnTime by high_disc x Mode_of_Shipment:
Mode_of_Shipment    Flight      Road      Ship
high_disc                                     
0                 0.454545  0.492126  0.469591
1                 0.878261  0.824074  0.872385

Counts by high_disc x Mode_of_Shipment:
Mode_of_Shipment  Flight  Road  Ship
high_disc                           
0                    242   254  1003
1                    115   108   478

OnTime by weight bucket x Mode_of_Shipment:
Mode_of_Shipment    Flight      Road      Ship
w_bucket                                      
<2029             0.696629  0.714286  0.706009
2029-3364         1.000000  1.000000  1.000000
3364-4201         0.676471  0.914286  0.748344
4201-5624         0.436709  

  print("\nOnTime rate by weight bucket:\n", X_te.groupby("w_bucket")["OnTime"].mean())


In [None]:
#1) Discount etkisi
#High discount (≥20–45 arası gibi) verilenlerde OnTime oranı %100’e çıkmış.
#Buna karşılık düşük indirimlerde (1–10 arası) OnTime oranı %18–26 civarında kalıyor.
#Yani indirim, müşteri için değil ama lojistik operasyon için güçlü bir sinyal gibi davranıyor.
#2) Weight (ağırlık) etkisi
#2029–3364 gr. aralığında %100 zamanında teslim var.
#3364–4201 gr. aralığında %76 civarında.
#Ama 4201+ gr. olduğunda oran %40’a kadar düşüyor.
#Demek ki ağırlık arttıkça zamanında teslim ihtimali düşüyor.
#3) Mode of Shipment (kargo tipi) × indirim
#High discount verilenlerde bütün taşıma tipleri %82–87 arası başarıyla gidiyor.
#Düşük discount verilenlerde %45–49 arası kalıyor.
#Discount, shipment tipinden bağımsız olarak güçlü bir faktör.
#4) Mode of Shipment × Weight
#Hafif paketlerde (≤2029) bütün taşıma tipleri %70 civarında.
#Orta ağırlıklarda (2029–3364) %100 başarı.
#Ağır paketlerde (4200 üstü) bütün taşıma tiplerinde %39–43’e kadar düşüyor.
#Taşıma tipinden çok ağırlık belirleyici.

In [49]:
from sklearn.inspection import permutation_importance
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

ART_DIR = Path("artifacts"); ART_DIR.mkdir(exist_ok=True)

r = permutation_importance(
    rf, X_test, y_test,
    scoring="roc_auc", n_repeats=5, random_state=42, n_jobs=-1
)

feat_names = rf.named_steps["prep"].get_feature_names_out()
pi = pd.Series(r.importances_mean, index=feat_names).sort_values(ascending=True)

pi.index = [s.replace("cat__", "").replace("num__", "") for s in pi.index]

pi_path = ART_DIR / "permutation_importance.csv"
pi.to_csv(pi_path, header=["importance"])

fig, ax = plt.subplots(figsize=(6, 8))
pi.tail(20).plot(kind="barh", ax=ax)
ax.set_title("Permutation Importance (top 20) — scoring=roc_auc")
ax.set_xlabel("Δ roc_auc")
fig.tight_layout()
pi_png = ART_DIR / "permutation_importance_top20.png"
fig.savefig(pi_png, dpi=150); plt.close(fig)

print(f"saved: {pi_path}")
print(f"saved: {pi_png}")

try:
    import mlflow
    mlflow.set_experiment("logistics-demo")
    with mlflow.start_run(run_name="rf-permutation-importance"):
        mlflow.log_artifact(str(pi_path), artifact_path="analysis")
        mlflow.log_artifact(str(pi_png), artifact_path="plots")
    print("[MLflow] logged permutation importance artifacts")
except Exception:
    pass

saved: artifacts/permutation_importance.csv
saved: artifacts/permutation_importance_top20.png
[MLflow] logged permutation importance artifacts


In [None]:
#Weight_in_gms (~0.084 ΔAUC): açık ara en kritik sinyal. Bu özelliği bozunca test ROC-AUC ~0.084 düşüyor.
#Discount_offered (~0.026 ΔAUC): ikinci güçlü sinyal.
#Geri kalanı ≈0 ya da hafif negatif: tek başına bozduğunda modelin AUC’si neredeyse değişmiyor → katkıları marjinal / etkileşimlere bağımlı.

In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

ART_DIR = Path("artifacts"); ART_DIR.mkdir(exist_ok=True)

X_te = X_test.copy()
X_te["proba_rf"] = proba_rf
X_te["OnTime"]   = y_test.values

def make_binned_plot(col, q=10):
    # quantile bin'leri ve etiketler
    bins = np.unique(np.quantile(X_te[col].astype(float), np.linspace(0, 1, q+1)))
    labels = [f"[{bins[i]:.0f},{bins[i+1]:.0f})" for i in range(len(bins)-1)]
    cut = pd.cut(X_te[col].astype(float), bins=bins, include_lowest=True, labels=labels)

    tbl = (
        X_te.groupby(cut, observed=True)
            .agg(n=("proba_rf","size"),
                 mean_proba=("proba_rf","mean"),
                 ontime_rate=("OnTime","mean"))
            .reset_index()
            .rename(columns={col:"bin"})
    )

    fig, ax1 = plt.subplots(figsize=(9, 4))
    ax2 = ax1.twinx()
    ax1.plot(range(len(tbl)), tbl["mean_proba"], marker="o", label="Mean Pred Proba")
    ax1.plot(range(len(tbl)), tbl["ontime_rate"], marker="s", label="OnTime Rate")
    ax2.bar(range(len(tbl)), tbl["n"], alpha=0.25, label="Count")

    ax1.set_title(f"Binned probability — {col}")
    ax1.set_ylabel("Rate / Probability")
    ax2.set_ylabel("Bin Count")

    ax1.set_xticks(range(len(tbl["bin"])))
    ax1.set_xticklabels(tbl["bin"], rotation=45, ha="right")

    ax1.legend(loc="upper left")
    fig.tight_layout()

    out_png = ART_DIR / f"binned_proba_{col}.png"
    out_csv = ART_DIR / f"binned_table_{col}.csv"
    fig.savefig(out_png, dpi=150); plt.close(fig)
    tbl.to_csv(out_csv, index=False)

    print(f"saved: {out_png}")
    print(f"saved: {out_csv}")
    return out_png, out_csv

p1_png, p1_csv = make_binned_plot("Weight_in_gms", q=10)
p2_png, p2_csv = make_binned_plot("Discount_offered", q=10)

try:
    import mlflow
    mlflow.set_experiment("logistics-demo")
    with mlflow.start_run(run_name="rf-binned-plots-fixed"):
        for p in [p1_png, p2_png]:
            mlflow.log_artifact(str(p), artifact_path="plots")
        for c in [p1_csv, p2_csv]:
            mlflow.log_artifact(str(c), artifact_path="analysis")
    print("[MLflow] logged binned plots & tables (fixed)")
except Exception:
    pass

saved: artifacts/binned_proba_Weight_in_gms.png
saved: artifacts/binned_table_Weight_in_gms.csv
saved: artifacts/binned_proba_Discount_offered.png
saved: artifacts/binned_table_Discount_offered.csv
[MLflow] logged binned plots & tables (fixed)


In [None]:
#Weight_in_gms (tablo/grafik):
#4.2–5.6 kg (ör. [4201,4531), [4531,4889), [4889,5265), [5265,5624)) dilimlerinde on-time rate ≈ %40–48 → belirgin risk bandı.
#2.0–3.36 kg civarı dilimlerde oran çok yüksek.
#Modelin mean predicted probası da aynı paterni takip ediyor → model & veri uyumlu.
#Discount_offered (tablo/grafik):
#%10–%19 ve %19–%45 dilimlerinde on-time rate ≈ 1.0 ve mean proba ≈ 0.995–0.997 → pratikte “garanti” bölge.
#%1–%8 arası dilimlerde hem oran hem proba düşük/orta.

In [69]:
import mlflow
from mlflow.tracking import MlflowClient

REG_NAME = "logistics-on-time"
print("tracking_uri =", mlflow.get_tracking_uri())
print("registry_uri =", mlflow.get_registry_uri())

client = MlflowClient()
vers = client.search_model_versions(f"name='{REG_NAME}'")
assert vers, "Registry'de versiyon bulunamadı."
latest = max(vers, key=lambda v: int(v.version))
print("latest version:", latest.version, "| status:", latest.status, "| stage:", latest.current_stage)

tracking_uri = file:///Users/burakbozatli/mlruns
registry_uri = file:///Users/burakbozatli/mlruns
latest version: 2 | status: READY | stage: Staging


In [81]:
import os, shutil, time, re, sys
from pathlib import Path
import mlflow
from mlflow.tracking import MlflowClient

try:
    import yaml
except Exception:
    raise RuntimeError("PyYAML gerekli. Kur: pip install pyyaml")

REG_NAME   = "logistics-on-time" 
TARGET_STAGE = "Staging"         
TARGET_VERSION = None            

reg_uri = mlflow.get_registry_uri()
if not reg_uri.startswith("file://"):
    raise SystemExit(f"Registry file-store değil: {reg_uri} (Bu yama sadece file:// için)")

base_dir = Path(reg_uri.replace("file://", ""))
if not base_dir.exists():
    raise SystemExit(f"Registry dizini yok: {base_dir}")

print("[info] registry_uri =", reg_uri)
print("[info] base_dir     =", base_dir)

client = MlflowClient()
vers = client.search_model_versions(f"name='{REG_NAME}'")
if not vers:
    raise SystemExit(f"Model bulunamadı: {REG_NAME}")

if TARGET_VERSION is None:
    TARGET_VERSION = max(int(v.version) for v in vers)
print(f"[info] hedef versiyon = v{TARGET_VERSION}")

meta_dir = base_dir / "models" / REG_NAME / f"version-{TARGET_VERSION}"
meta_path = meta_dir / "meta.yaml"
if not meta_path.exists():
    raise SystemExit(f"meta.yaml bulunamadı: {meta_path}")

ts = time.strftime("%Y%m%d-%H%M%S")
backup_path = meta_dir / f"meta.yaml.bak-{ts}"
shutil.copy2(meta_path, backup_path)
print(f"[ok] yedek alındı → {backup_path}")

with open(meta_path, "r", encoding="utf-8") as f:
    data = yaml.safe_load(f)

if not isinstance(data, dict):
    raise SystemExit("meta.yaml beklenen formatta değil (dict).")

data["current_stage"] = str(TARGET_STAGE)
data["name"] = data.get("name", REG_NAME)
data["version"] = str(data.get("version", TARGET_VERSION))

for key in ["latest_metrics", "aliases", "tags"]:
    if key in data:
        try:
            del data[key]
        except Exception:
            pass

def sanitize(obj):
    if isinstance(obj, dict):
        return {str(k): sanitize(v) for k, v in obj.items()}
    elif isinstance(obj, (list, tuple, set)):
        return [sanitize(x) for x in obj]
    elif isinstance(obj, (int, float, str)) or obj is None or isinstance(obj, bool):
        return obj
    else:
        return str(obj)

data = sanitize(data)

tmp_path = meta_dir / f".meta.yaml.tmp-{ts}"
with open(tmp_path, "w", encoding="utf-8") as f:
    yaml.safe_dump(
        data, f, default_flow_style=False, allow_unicode=True, sort_keys=True
    )
shutil.move(str(tmp_path), str(meta_path))
print(f"[ok] meta.yaml güncellendi → stage='{TARGET_STAGE}'")

with open(meta_path, "r", encoding="utf-8") as f:
    chk = yaml.safe_load(f)
print("[verify] current_stage:", chk.get("current_stage"), "| version:", chk.get("version"))

print("\n MLflow UI → Models →", REG_NAME, "→ Versions → v", TARGET_VERSION,
      " .", sep="")

[info] registry_uri = file:///Users/burakbozatli/mlruns
[info] base_dir     = /Users/burakbozatli/mlruns
[info] hedef versiyon = v2
[ok] yedek alındı → /Users/burakbozatli/mlruns/models/logistics-on-time/version-2/meta.yaml.bak-20250916-174353
[ok] meta.yaml güncellendi → stage='Staging'
[verify] current_stage: Staging | version: 2

 MLflow UI → Models →logistics-on-time→ Versions → v2 .


In [83]:
import time, shutil
from pathlib import Path
import mlflow
from mlflow.tracking import MlflowClient

REG_NAME = "logistics-on-time"
TARGET_VERSION = None      
ARCHIVE_OLD = True     

client = MlflowClient()

# 1) Versiyonu bul
vers = client.search_model_versions(f"name='{REG_NAME}'")
assert vers, f"Model yok: {REG_NAME}"
if TARGET_VERSION is None:
    TARGET_VERSION = str(max(int(v.version) for v in vers))

print(f"[info] hedef: {REG_NAME} v{TARGET_VERSION} → Production")

try:
    client.transition_model_version_stage(
        name=REG_NAME, version=TARGET_VERSION,
        stage="Production", archive_existing_versions=ARCHIVE_OLD
    )
    mv = client.get_model_version(REG_NAME, TARGET_VERSION)
    print(f"[ok] API ile Production'a alındı → stage={mv.current_stage}")
    done = True
except Exception as e:
    print("[warn] API ile transition hata verdi:", repr(e))
    done = False

if not done:
    reg_uri = mlflow.get_registry_uri()
    if not reg_uri.startswith("file://"):
        raise SystemExit(f"Registry file-store değil: {reg_uri} (fallback yalnız file:// için)")
    base_dir = Path(reg_uri.replace("file://", ""))
    meta_dir = base_dir / "models" / REG_NAME / f"version-{TARGET_VERSION}"
    meta_path = meta_dir / "meta.yaml"
    assert meta_path.exists(), f"meta.yaml yok: {meta_path}"

    ts = time.strftime("%Y%m%d-%H%M%S")
    backup_path = meta_dir / f"meta.yaml.bak-{ts}"
    shutil.copy2(meta_path, backup_path)

    import yaml
    with open(meta_path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)
    if not isinstance(data, dict):
        raise SystemExit("meta.yaml beklenmeyen format")

    data["current_stage"] = "Production"
    data["name"] = data.get("name", REG_NAME)
    data["version"] = str(data.get("version", TARGET_VERSION))
    for key in ["latest_metrics", "aliases", "tags"]:
        if key in data:
            del data[key]

    def sanitize(obj):
        if isinstance(obj, dict):
            return {str(k): sanitize(v) for k,v in obj.items()}
        if isinstance(obj, (list, tuple, set)):
            return [sanitize(x) for x in obj]
        if isinstance(obj, (int, float, str, type(None), bool)):
            return obj
        return str(obj)

    data = sanitize(data)
    tmp = meta_dir / f".meta.yaml.tmp-{ts}"
    with open(tmp, "w", encoding="utf-8") as f:
        yaml.safe_dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=True)
    tmp.replace(meta_path)

    with open(meta_path, "r", encoding="utf-8") as f:
        chk = yaml.safe_load(f)
    print(f"[ok] Fallback ile Production → stage={chk.get('current_stage')} | version={chk.get('version')}")


[info] hedef: logistics-on-time v2 → Production
[warn] API ile transition hata verdi: RepresenterError('cannot represent an object', <Metric: dataset_digest=None, dataset_name=None, key='prec_at_040', model_id='m-c9ddb59bf72a49aea670dc1cdd542fad', run_id='2886c2319cf747cdabdaf10edf39b111', step=0, timestamp=1758025509868, value=0.636830102622577>)
[ok] Fallback ile Production → stage=Production | version=2


  client.transition_model_version_stage(


In [87]:
from mlflow.tracking import MlflowClient
client = MlflowClient()
client.set_registered_model_alias("logistics-on-time", "Production", 2)