In [26]:
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

In [27]:
df = pd.DataFrame({
    "area": [80,100,120,140,160,180,200,220,240,260,90,150],
    "distance_km": [18,16,14,12,10,12,9,7,6,5,20,8]
})

In [28]:
score = 0.03*df["area"] - 0.20*df["distance_km"] + 0.05*np.random.randn(len(df))
df["expensive"] = (score >= score.median()).astype(int)

In [29]:
X = df[["area","distance_km"]]; 
y = df["expensive"]

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# Scaler để chuẩn hóa các cột tránh thiên vị, 
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=100)).fit(X_train, y_train)

In [32]:
y_pred  = pipe.predict(X_test)
y_prob1 = pipe.predict_proba(X_test)[:, 1]  # Col 0=>y=0, 1=>y=1 (y)

In [33]:
print("Accuracy :", round(accuracy_score(y_test, y_pred), 3))
print("ROC-AUC  :", round(roc_auc_score(y_test, y_prob1), 3))

Accuracy : 1.0
ROC-AUC  : 1.0


In [34]:
x_new = pd.DataFrame({"area":[185,120], "distance_km":[8,18]})
print("Probabilities:", pipe.predict_proba(x_new)[:,1].round(3).tolist(), " Labels:", pipe.predict(x_new).tolist())


Probabilities: [0.784, 0.074]  Labels: [1, 0]


In [36]:
p_one = pipe.predict_proba(pd.DataFrame({"area":[50], "distance_km":[3]}))[:,1].item()
label_one = pipe.predict(pd.DataFrame({"area":[185], "distance_km":[8]})).item()
print("P(expensive=1):", round(p_one, 3), " -> label:", label_one)

P(expensive=1): 0.396  -> label: 1
