In [1]:
# [기본 세팅] ------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from IPython.display import display

import warnings

# Hide UserWarning-category warnings for the rest of the session.
warnings.filterwarnings('ignore', category=UserWarning)

# Matplotlib: use a Korean-capable Windows font and keep the minus sign
# from rendering as a broken glyph.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

# Pandas display options (avoid wide frames being wrapped too aggressively).
pd.options.display.max_columns = 120
pd.options.display.width = 160

print("✅ 기본 세팅 완료")

✅ 기본 세팅 완료


In [3]:
# === [Restore from bundle] ===
import pickle

BUNDLE_PATH = "_snapshots/olist_clean_bundle.pkl"

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load a snapshot that this project wrote itself; never point
# BUNDLE_PATH at a file from an untrusted source.
with open(BUNDLE_PATH, "rb") as f:
    bundle = pickle.load(f)

# Fail fast, listing every missing entry at once, instead of raising a bare
# KeyError halfway through the unpacking below (which would leave the
# notebook namespace only partially populated).
# Assumes `bundle` is a dict-like keyed by table name, as the lookups below imply.
_missing_keys = [
    key
    for key in (
        "orders_clean",
        "order_items_clean",
        "payments_clean",
        "reviews_clean",
        "customers_clean",
        "geolocation_clean",
        "products_clean",
        "sellers_clean",
        "category_tr_clean",
        "rfm",
    )
    if key not in bundle
]
if _missing_keys:
    raise KeyError(f"{BUNDLE_PATH} is missing expected entries: {_missing_keys}")

# Unpack into notebook-level variables for direct use.
orders_clean       = bundle["orders_clean"]
order_items_clean  = bundle["order_items_clean"]
payments_clean     = bundle["payments_clean"]
reviews_clean      = bundle["reviews_clean"]
customers_clean    = bundle["customers_clean"]
geolocation_clean  = bundle["geolocation_clean"]
products_clean     = bundle["products_clean"]
sellers_clean      = bundle["sellers_clean"]
category_tr_clean  = bundle["category_tr_clean"]
rfm                = bundle["rfm"]

print("✅ Restored all *_clean DataFrames")

✅ Restored all *_clean DataFrames


In [None]:
def _ensure_category_en(products_df, cat_tr_df=None):
    """products에 category_en 없으면 번역 테이블로 채우기(없으면 'unknown')."""
    if "category_en" in products_df.columns:
        return products_df
    prod = products_df.copy()
    if cat_tr_df is not None and "product_category_name" in prod.columns:
        m = cat_tr_df.set_index("product_category_name")["product_category_name_english"].to_dict()
        prod["category_en"] = prod["product_category_name"].map(m).fillna("unknown")
    else:
        prod["category_en"] = prod.get("product_category_name", "unknown")
    return prod

# Use the translation table when one exists in the notebook namespace;
# otherwise pass None so the helper falls back on its own.
products_clean = _ensure_category_en(
    products_clean,
    globals().get("category_tr_clean"),
)

# Palette of five hex colours (the name suggests segment plots; usage is not
# visible in this part of the notebook).
_seg_colors = [
    "#4E79A7",
    "#F28E2B",
    "#76B7B2",
    "#EDC948",
    "#E15759",
]


In [5]:
# === S1) Transaction base ===
# Order statuses that count as a valid transaction.
valid_status = ["delivered","invoiced","shipped","processing","created"]

# One row per order item: orders joined to their items (inner), then to the
# product's English category (left, so items with no product match are kept).
tx = orders_clean[["order_id","customer_id","order_purchase_timestamp","order_status"]].merge(
    order_items_clean[["order_id","product_id","price"]],
    on="order_id",
    how="inner",
).merge(
    products_clean[["product_id","category_en"]],
    on="product_id",
    how="left",
)

# Keep valid statuses only, then derive the date columns and clean the
# category / price columns. Unparseable timestamps become NaT, a missing
# category becomes "unknown", and a non-numeric price becomes 0.0.
tx = tx.loc[tx["order_status"].isin(valid_status)].assign(
    order_date=lambda d: pd.to_datetime(d["order_purchase_timestamp"], errors="coerce").dt.normalize(),
    order_month=lambda d: d["order_date"].dt.to_period("M").dt.to_timestamp(),
    category_en=lambda d: d["category_en"].fillna("unknown"),
    price=lambda d: pd.to_numeric(d["price"], errors="coerce").fillna(0.0),
)

In [None]:
# === S2) RFM + scoring + segments ===
# Collapse item rows to one row per (customer, order, purchase day) with the
# order's summed item price. Rows whose order_date is NaT are not carried
# over, because groupby drops NaN/NaT keys by default.
cust_ord = tx.groupby(
    ["customer_id", "order_id", "order_date"], as_index=False
)["price"].sum()

# Reference date: the last purchase day present in the data. The +1 below
# makes a purchase on the reference day count as recency 1 rather than 0.
ref_date = cust_ord["order_date"].max()

# Per-customer R/F/M inputs: last purchase day, number of distinct orders,
# and total spend.
# NOTE: this rebinds `rfm`, replacing the frame restored from the bundle.
# NOTE(review): if customer_id is issued per order (as in the public Olist
# dataset, where customer_unique_id identifies the person), frequency will be
# 1 for every row — confirm which key is intended.
rfm = cust_ord.groupby("customer_id", as_index=False).agg(
    last_purchase=pd.NamedAgg(column="order_date", aggfunc="max"),
    frequency=pd.NamedAgg(column="order_id", aggfunc="nunique"),
    monetary=pd.NamedAgg(column="price", aggfunc="sum"),
)
rfm["recency_days"] = (ref_date - rfm["last_purchase"]).dt.days.add(1)

# 분위수 기반 점수(1~5). Recency는 작을수록(최근일수록) 좋은 점수.
def quantile_score(series, n=5, higher_is_better=True):
    """Convert a numeric series into integer quantile scores from 1 to ``n``.

    Parameters
    ----------
    series : pandas.Series
        Values to score. Missing values are replaced by the series median
        before ranking.
    n : int, default 5
        Number of score buckets; results lie in ``1..n``.
    higher_is_better : bool, default True
        If True the largest value gets the top score ``n``; if False the
        smallest value does (e.g. recency in days).

    Returns
    -------
    pandas.Series
        Integer scores aligned with ``series``. Ties share the score of their
        average rank.
    """
    s = series.fillna(series.median())
    # Rank in the direction that puts the best value at percentile 1.0.
    # The previous `1 - pct` inversion was shifted by one rank: the worst
    # value landed on 0 (hidden by the clip) and, for example, the best of
    # five values scored 4 instead of 5. Ranking descending keeps both
    # directions symmetric.
    pct = s.rank(pct=True, method="average", ascending=higher_is_better)
    score = np.ceil(pct * n)
    # clip stays as a guard so results never leave the 1..n range.
    return score.astype(int).clip(1, n)

# Score each RFM input from 1 to 5; recency is the only one where smaller is better.
rfm["r_score"] = quantile_score(series=rfm["recency_days"], n=5, higher_is_better=False)
rfm["f_score"] = quantile_score(series=rfm["frequency"], n=5, higher_is_better=True)
rfm["m_score"] = quantile_score(series=rfm["monetary"], n=5, higher_is_better=True)

# Weighted sum of the three scores. Evaluation order is kept as
# (R + F) + M so floating-point results, and therefore rank ties, are unchanged.
W_R, W_F, W_M = 0.40, 0.35, 0.25
rfm["weighted_score"] = (
    rfm["r_score"].mul(W_R)
    .add(rfm["f_score"].mul(W_F))
    .add(rfm["m_score"].mul(W_M))
)

# Descending percentile rank: the highest weighted score falls into tier 1.
pct_rank = rfm["weighted_score"].rank(ascending=False, pct=True)
rfm["tier"] = np.ceil(pct_rank.mul(5)).astype(int).clip(lower=1, upper=5)

# Tier -> segment label.
tier_to_seg = {
    1: "VVIP",
    2: "VIP",
    3: "고액고객",
    4: "예비VIP",
    5: "유망",
}
rfm["segment"] = rfm["tier"].map(tier_to_seg)

# Output 1: a snapshot copy of the scored customer table.
rfm_customers = rfm.copy()
