##  **0. 라이브러리 불러오기**

In [1]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# 데이터 분포 확인을 위한 plt 라이브러리 import 
import matplotlib.pyplot as plt
import seaborn as sns

## **1. 분석 데이터 불러오기**

In [14]:
# Load the preprocessed master table and its ML-oriented counterpart.
# Both CSV paths are resolved relative to the notebook's working directory.
prep, ml_prep = (
    pd.read_csv(csv_name)
    for csv_name in ("merged_olist_new.csv", "ML_olist_new2.csv")
)

In [15]:
# Print the column index of each table, separated by an 80-character rule.
print(
    '[마스터 테이블 컬럼]',
    prep.columns,
    '=' * 80,
    '[ml용 마스터 테이블 컬럼]',
    ml_prep.columns,
    sep='\n',
)

[마스터 테이블 컬럼]
Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date',
       'price', 'freight_value', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm',
       'product_category_name_english', 'seller_city', 'seller_state',
       'review_score', 'review_comment_message', 'customer_unique_id',
       'customer_city', 'customer_state', 'payment_sequential',
       'payment_types', 'payment_installments', 'payment_value_total',
       'is_black_friday', 'is_carnival', 'has_photos', 'has_description',
       'description_length', 'has_text_review', 'is_same_state',
       'pg_processing_days', 'seller_processing_days', 'delivery_days',
       'is

#### **내가 진행하고 싶은건!**
1. event_day 시계열로 만들어서 붙이는건 어려움이 있음  
  - 가중치를 두는 건 어떨지?
    - 그러기 위해서는 **월별 배송 지연율 / 월별 처리 지연 비율** 먼저 봐야함
    - **배송 지연율 :** delivery_days > 0 -> is_late_delivery == 1
    - **월별 처리 지연 비율 :** seller_delay_days > 0 -> is_seller_late == 1
    

- is_logistics_fault  : order_delivered_customer_date > order_estimated_delivery_date
- seller_delay_days(양수 = 과실) : order_delivered_carrier_date - shipping_limit_date
- processing_days_diff(값이 클수록 처리시간 빠름) : 카테고리 평균 - seller_processing_days

In [29]:
# Binary flag for orders the seller handed over late:
# 1 when seller_delay_days is strictly positive, otherwise 0.
# Missing delays compare as False and therefore also become 0.
ml_prep2 = ml_prep.copy()  # work on a copy so ml_prep stays as loaded
ml_prep2['is_seller_late'] = ml_prep2['seller_delay_days'].gt(0).astype(int)

## **2. seller_profile 만들기**

In [30]:
# Build one row per seller_id from the row-level table ml_prep2.
# Each entry maps an output column to (source column, aggregation).
seller_profile = (
    ml_prep2
    .groupby("seller_id")
    .agg(**{
        # Order volume: number of distinct orders
        "total_orders": ("order_id", "nunique"),
        # Review metrics: mean score and share of rows scored 3 or lower
        "avg_review_score": ("review_score", "mean"),
        "negative_review_ratio": ("review_score", lambda scores: (scores <= 3).mean()),
        # Dispatch / processing metrics
        "seller_late_ratio": ("is_seller_late", "mean"),
        "avg_seller_late_days": ("seller_delay_days", "mean"),
        "avg_proc_days_diff": ("processing_days_diff", "mean"),
    })
    .reset_index()
)

seller_profile

Unnamed: 0,seller_id,total_orders,avg_review_score,negative_review_ratio,seller_late_ratio,avg_seller_late_days,avg_proc_days_diff
0,001cca7ae9ae17fb1caed9dfb1094831,104,3.975000,0.258333,0.041667,-3.552083,-0.468333
1,002100f778ceb8431b7a1020ff7ab48f,9,3.700000,0.400000,0.000000,-3.659000,-1.109000
2,004c9cd9d87a3c30c522c48c4fc07416,85,4.336735,0.183673,0.000000,-5.311224,-1.156939
3,00720abe85ba0859807595bbf045a33b,7,3.500000,0.562500,0.125000,-3.871875,0.038125
4,00ab3eff1b5192e5f1a63bcecfee11c8,1,5.000000,0.000000,0.000000,-1.100000,1.260000
...,...,...,...,...,...,...,...
2666,ffc470761de7d0232558ba5e786e57b7,21,4.636364,0.136364,0.000000,-3.338636,-1.385909
2667,ffdd9f82b9a447f6f8d4b91554cc7dd3,8,4.500000,0.125000,0.250000,-1.735000,1.911250
2668,ffeee66ac5d5a62fe688b9d26f83f534,4,3.750000,0.250000,0.500000,5.297500,7.895000
2669,fffd5413c0700ac820c7069d66d98c89,24,4.375000,0.166667,0.041667,-3.307083,-0.843750


#### **2-1. 유의 판매자 기준 설정**

In [34]:
# Summary statistics with the 10th and 90th percentiles added to the usual quartiles;
# the tier cut-offs used further down are read off this table.
seller_profile.describe(percentiles=[0.10, 0.25, 0.50, 0.75, 0.90])

Unnamed: 0,total_orders,avg_review_score,negative_review_ratio,seller_late_ratio,avg_seller_late_days,avg_proc_days_diff
count,2671.0,2671.0,2671.0,2671.0,2671.0,2671.0
mean,21.518158,4.252686,0.184215,0.10974,-3.151623,0.309338
std,62.760662,0.777261,0.242384,0.231098,3.229273,3.139663
min,1.0,1.0,0.0,0.0,-32.626667,-8.931579
10%,1.0,3.363636,0.0,0.0,-5.5025,-1.79
25%,2.0,4.0,0.0,0.0,-4.34,-1.238528
50%,5.0,4.378049,0.117647,0.0,-3.258125,-0.515
75%,16.0,4.833333,0.258065,0.10323,-2.22,0.848873
90%,47.0,5.0,0.5,0.353659,-0.755571,3.007778
max,1073.0,5.0,1.0,1.0,45.43,45.17


In [35]:
# Upper fence Q3 + 1.5 * IQR for total_orders, with Q3 = 16 and Q1 = 2
# taken from the describe() table.
16 + (16 - 2) * 1.5

37.0

| 구분 | 기준 | 수식 |
|-----|------|------|
|상위 판매자| 상위 10% | total_orders >= **47**|
|중위 판매자| 상위 10~25% | **16** <= total_orders < **47**|

- Q3 + 1.5 * IQR = 16 + 1.5 * (16 - 2) = 37 (Q1 = 2, Q3 = 16)

In [36]:
# Assign a seller tier from total_orders:
#   1 -> 47 or more orders, 2 -> 16 to 46 orders, 3 -> everything else.
# np.select takes the first matching condition, so the >= 47 test must come first.
seller_profile['seller_level'] = np.select(
    condlist=[
        seller_profile['total_orders'].ge(47),
        seller_profile['total_orders'].ge(16),
    ],
    choicelist=[1, 2],
    default=3,
)

In [37]:
# Independent copy of the tier-1 sellers, followed by its summary statistics.
seller_profile_lvl1 = seller_profile.loc[seller_profile['seller_level'].eq(1)].copy()
seller_profile_lvl1.describe()

Unnamed: 0,total_orders,avg_review_score,negative_review_ratio,seller_late_ratio,avg_seller_late_days,avg_proc_days_diff,seller_level
count,271.0,271.0,271.0,271.0,271.0,271.0,271.0
mean,138.509225,4.205472,0.1976,0.081164,-3.37297,-0.074058,1.0
std,151.03443,0.284306,0.083308,0.099108,1.429263,1.485641,0.0
min,47.0,2.896552,0.014085,0.0,-9.95322,-6.111573,1.0
25%,61.0,4.045335,0.140977,0.01653,-3.886259,-1.053288,1.0
50%,83.0,4.235294,0.1875,0.044025,-3.230595,-0.450339,1.0
75%,146.0,4.393318,0.247612,0.102968,-2.611301,0.441569,1.0
max,1073.0,4.789474,0.568966,0.566667,0.621202,8.736094,1.0


In [38]:
# Independent copy of the tier-2 sellers, followed by its summary statistics.
seller_profile_lvl2 = seller_profile.loc[seller_profile['seller_level'].eq(2)].copy()
seller_profile_lvl2.describe()

Unnamed: 0,total_orders,avg_review_score,negative_review_ratio,seller_late_ratio,avg_seller_late_days,avg_proc_days_diff,seller_level
count,407.0,407.0,407.0,407.0,407.0,407.0,407.0
mean,27.02457,4.23568,0.191858,0.089161,-3.266729,-0.012074,2.0
std,8.659366,0.377424,0.114126,0.125065,1.516754,1.933983,0.0
min,16.0,1.64,0.0,0.0,-10.74725,-8.931579,2.0
25%,19.0,4.065591,0.108402,0.0,-3.855513,-1.138403,2.0
50%,26.0,4.277778,0.171429,0.042553,-3.33871,-0.6005,2.0
75%,34.0,4.5,0.25641,0.125,-2.575761,0.685921,2.0
max,46.0,5.0,0.9,0.848485,4.126061,14.413333,2.0


## **3. 머신러닝 모델**  
  
- IQR 이상치 기준으로 설정

In [39]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
    roc_curve
)

In [None]:
# 0) Preparation: make sure seller_profile exposes every column the modelling
#    cells rely on, then take a working copy named df.

required_cols = [
    "seller_id", "total_orders",
    "avg_review_score", "negative_review_ratio",
    "seller_late_ratio", "avg_seller_late_days", "avg_proc_days_diff",
]
missing = [name for name in required_cols if name not in seller_profile.columns]
if not missing:
    # Copy so that later label/score columns do not touch seller_profile.
    df = seller_profile.copy()
else:
    raise ValueError(f"seller_profile에 필요한 컬럼이 없습니다: {missing}")

In [143]:
# 1) Keep only sellers with at least MIN_ORDERS orders, to stabilise the
#    per-seller ratios (the original author marked this step as an open question).

MIN_ORDERS = 10
df = df.loc[df["total_orders"].ge(MIN_ORDERS)].reset_index(drop=True)

In [144]:
# 2) IQR 기반 임계값 계산 함수

def iqr_upper_threshold(s: pd.Series, k: float = 1.5) -> float:
    """Return the upper IQR fence ``Q3 + k * (Q3 - Q1)`` of a series.

    Args:
        s: Series whose 25% and 75% quantiles are computed.
        k: Multiplier applied to the inter-quartile range (default 1.5).

    Returns:
        The value above which observations are treated as upper outliers.
    """
    lower_q, upper_q = (s.quantile(p) for p in (0.25, 0.75))
    spread = upper_q - lower_q
    return upper_q + k * spread

In [145]:
# 3) Label construction
#
# A seller is labelled risky (is_risky_seller = 1) when at least one of the
# risk metrics lies above its own upper IQR fence (k = 1.5), computed on df.
#
# NOTE(review): the notebook's markdown notes describe processing_days_diff as
# "larger = faster processing". If that sign convention holds, an upper fence on
# avg_proc_days_diff flags unusually fast sellers rather than slow ones; confirm.

risk_cols = ["negative_review_ratio", "seller_late_ratio", "avg_proc_days_diff"]

thresholds = {col: iqr_upper_threshold(df[col], k=1.5) for col in risk_cols}
print("[IQR thresholds]")
print("\n".join(f" - {name}: {fence:.6f}" for name, fence in thresholds.items()))

# Compare every risk column with its fence in one vectorised step; the Series of
# fences is aligned to the columns by name.
df["is_risky_seller"] = (
    df[risk_cols]
    .gt(pd.Series(thresholds), axis="columns")
    .any(axis="columns")
    .astype(int)
)

print("\n[Label distribution]", df["is_risky_seller"].value_counts(dropna=False), sep="\n")
print("Positive ratio:", df["is_risky_seller"].mean())

[IQR thresholds]
 - negative_review_ratio: 0.431762
 - seller_late_ratio: 0.277276
 - avg_proc_days_diff: 3.148343

[Label distribution]
is_risky_seller
0    510
1     62
Name: count, dtype: int64
Positive ratio: 0.10839160839160839


In [146]:
# 4) Feature matrix and target
#
# NOTE(review): negative_review_ratio, seller_late_ratio and avg_proc_days_diff
# are the very columns thresholded to create is_risky_seller, so the classifier
# can recover the label rule directly from its inputs. Confirm this is intended
# before reading the scores as predictive performance.

feature_cols = [
    "total_orders",
    "negative_review_ratio",
    "seller_late_ratio",
    # avg_seller_late_days is currently excluded from the feature set
    "avg_proc_days_diff",
]

X = df.loc[:, feature_cols].copy()
y = df["is_risky_seller"].copy()

# Random forests cannot take NaN directly here, so fill any gaps with the column median.
if X.isna().to_numpy().any():
    X = X.fillna(X.median(numeric_only=True))

In [147]:
# 5) Hold out 20% of sellers for testing; stratify on y so both splits keep
#    the same share of risky sellers, and fix the seed for repeatability.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [148]:
# 6) Fit the random forest
#
# class_weight="balanced" reweights classes to offset the small positive class,
# and min_samples_leaf=5 keeps leaves from isolating single sellers.

rf = RandomForestClassifier(
    n_estimators=300,
    min_samples_leaf=5,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42,
)
rf.fit(X_train, y_train)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",300
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",5
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [149]:
# 7) Predict on the held-out split and report metrics
#
# y_pred uses the estimator's default decision rule; y_proba is the predicted
# probability of the positive class (column 1) and feeds the ranking metrics.

y_proba = rf.predict_proba(X_test)[:, 1]
y_pred = rf.predict(X_test)

print("\n[Confusion Matrix]", confusion_matrix(y_test, y_pred), sep="\n")
print("\n[Classification Report]", classification_report(y_test, y_pred, digits=4), sep="\n")

roc_auc = roc_auc_score(y_test, y_proba)
pr_auc = average_precision_score(y_test, y_proba)

print(f"[ROC-AUC] {roc_auc:.4f}", f"[PR-AUC ] {pr_auc:.4f}", sep="\n")


[Confusion Matrix]
[[101   2]
 [  4   8]]

[Classification Report]
              precision    recall  f1-score   support

           0     0.9619    0.9806    0.9712       103
           1     0.8000    0.6667    0.7273        12

    accuracy                         0.9478       115
   macro avg     0.8810    0.8236    0.8492       115
weighted avg     0.9450    0.9478    0.9457       115

[ROC-AUC] 0.9854
[PR-AUC ] 0.9000


In [158]:
# 8) Re-evaluate with a custom probability cut-off instead of the default rule
#
# NOTE(review): the cut-off is assessed on the same test split as the metrics
# above; if 0.62 was picked by looking at these results, the reported precision
# is optimistic. Confirm how the value was chosen.

THRESH = 0.62
y_pred_custom = np.where(y_proba >= THRESH, 1, 0)

print(f"\n[Custom threshold = {THRESH}]", confusion_matrix(y_test, y_pred_custom), sep="\n")
print(classification_report(y_test, y_pred_custom, digits=4))


[Custom threshold = 0.62]
[[103   0]
 [  4   8]]
              precision    recall  f1-score   support

           0     0.9626    1.0000    0.9810       103
           1     1.0000    0.6667    0.8000        12

    accuracy                         0.9652       115
   macro avg     0.9813    0.8333    0.8905       115
weighted avg     0.9665    0.9652    0.9621       115



In [None]:
# 9) Feature importances
#
# Label the importances with the feature names recorded by the fitted model
# (feature_names_in_, set when fit() receives a DataFrame) rather than with the
# notebook-level feature_cols list. That list can be edited after the model was
# trained, which would silently mislabel the importances or fail on a length
# mismatch. Fall back to feature_cols only when the model carries no names.

importances = pd.Series(
    rf.feature_importances_,
    index=getattr(rf, "feature_names_in_", feature_cols),
).sort_values(ascending=False)
print("\n[Feature Importances]")
print(importances)


[Feature Importances]
seller_late_ratio        0.668946
negative_review_ratio    0.243267
total_orders             0.087787
dtype: float64


In [None]:
# 10) Rank sellers by predicted risk (scored on the full dataset)
#
# X holds every seller in df, including the rows the forest was trained on,
# so most of these probabilities are in-sample scores.

df["risk_proba"] = rf.predict_proba(X)[:, 1]
top_risky = (
    df.sort_values(by="risk_proba", ascending=False)
    .head(30)
    [["seller_id", "risk_proba", "is_risky_seller", *feature_cols]]
)

print("\n[Top 30 risky sellers]")
display(top_risky)


[Top 30 risky sellers]


Unnamed: 0,seller_id,risk_proba,is_risky_seller,total_orders,avg_review_score,negative_review_ratio,seller_late_ratio,avg_seller_late_days,avg_proc_days_diff
377,a49928bcdf77c55c6d6e05e09a9b4ca5,0.999263,1,31,2.742857,0.6,0.4,1.467429,4.609714
248,6fd52c528dcb38be2eea044946b811f8,0.999106,1,31,2.787879,0.515152,0.424242,-1.05303,14.413333
305,835f0f7810c76831d6c7d24c7a646d4d,0.994599,1,28,3.515152,0.424242,0.848485,4.126061,6.342727
357,99002261c568a84cce14d43fcffb43ea,0.992758,1,21,3.904762,0.428571,0.333333,-0.535238,3.431905
494,db2956745b3a8e9f3785c99f34b5d25e,0.991746,1,31,4.0,0.3125,0.5,-0.466562,4.549688
116,2eb70248d66e0e3ef83659f71b244378,0.990858,1,129,2.905109,0.562044,0.328467,-0.018467,8.18365
10,04308b1ee57b6625f47df1d56f00eedf,0.99016,1,51,3.647059,0.352941,0.392157,-1.214118,4.653725
355,972d0f9cf61b499a4812cf0bfa3ad3c4,0.989962,1,38,2.871795,0.615385,0.384615,-0.85,2.230769
75,1ca7077d890b907f89be8c954a02686a,0.989726,1,37,1.64,0.9,0.54,-0.5174,2.3502
333,8e6d7754bc7e0f22c96d255ebda59eba,0.987697,1,71,2.896552,0.568966,0.508621,-0.129828,2.300517
