# 04 · Purchase Propensity Model
Logistic regression baseline with RFM + AOV features.

In [None]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss

# Input CSVs are resolved relative to the notebook's working directory.
DATA = Path("../data")
tx = pd.read_csv(DATA / "transactions.csv", parse_dates=["date"])
rfm = pd.read_csv(DATA / "user_rfm.csv")

# Time windows, anchored on the most recent transaction date:
#   feature window: [horizon_start, future_start]  (28 to 14 days before last_date)
#   label window:   (future_start, last_date]      (the final 14 days)
last_date = tx["date"].max()
horizon_start = last_date - pd.Timedelta(days=28)
future_start = last_date - pd.Timedelta(days=14)

# AOV feature: mean line revenue per user over the feature window ONLY.
# The window must end at future_start; letting it run to last_date would
# include the very purchases that define the label (target leakage) and
# inflate every metric reported downstream.
base = (
    tx[(tx["date"] >= horizon_start) & (tx["date"] <= future_start)]
    .groupby("user_id")
    .agg(aov=("line_revenue", "mean"))
    .reset_index()
)

# Positive class: users with at least one transaction strictly after future_start.
future_users = tx[tx["date"] > future_start]["user_id"].unique()

# Left-join so every user in the RFM table is kept; users with no transaction
# in the feature window (or only missing revenue values) get aov = 0.0.
# NOTE(review): user_rfm.csv is used as-is. If its recency/frequency/monetary
# columns were computed from transactions after future_start, they leak the
# label in the same way -- confirm against the upstream notebook.
X = rfm.merge(base, on="user_id", how="left").fillna({"aov": 0.0})
y = X["user_id"].isin(future_users).astype(int)

# Model inputs: the three RFM columns plus the average-order-value feature.
features = ["recency_days", "frequency", "monetary", "aov"]

# Hold out 25% of the rows for evaluation. Stratifying on y keeps the positive
# rate the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X[features],
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Standardize the features, then fit a logistic regression on the scaled values.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=500)),
    ]
)
pipe.fit(X_train, y_train)

# predict_proba columns follow the fitted class order; with 0/1 labels,
# column 1 holds the predicted probability of the positive class.
proba = pipe.predict_proba(X_test)[:, 1]

# Report ranking quality (ROC AUC, PR AUC) and the Brier score on the
# held-out split as plain Python floats.
print(
    {
        metric_name: float(metric_fn(y_test, proba))
        for metric_name, metric_fn in (
            ("roc_auc", roc_auc_score),
            ("pr_auc", average_precision_score),
            ("brier", brier_score_loss),
        )
    }
)