In [3]:
# === GitHub Event Anomaly Detection (Train: Aug 1~7 10–24h @ dataset, Test: Jun 1 0–24h @ test)
# Single Jupyter cell version with rich logging & NaN-safe feature engineering
# Python 3.9+ recommended

from pathlib import Path
from datetime import datetime
from typing import List, Tuple, Optional, Dict
from contextlib import contextmanager
import json, gzip, os, time, sys, traceback, logging

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import display

from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest

# ----------------------------
# CONFIG (edit only the values below if desired)
# ----------------------------
# Training window: directory of event files plus the date/hour range to keep.
TRAIN_DIR = r"C:\Users\EL59\Documents\final_project\dataset"
TRAIN_START_DATE = "2025-08-01"
TRAIN_END_DATE   = "2025-08-07"
TRAIN_START_HOUR = 10
TRAIN_END_HOUR   = 24        # 24 is treated internally as "up to and including hour 23"

# Test window: same structure as the training window.
TEST_DIR  = r"C:\Users\EL59\Documents\final_project\test"
TEST_START_DATE = "2025-06-01"
TEST_END_DATE   = "2025-06-01"
TEST_START_HOUR = 0
TEST_END_HOUR   = 24

# Timestamps are converted to this timezone before hour/date are derived.
TIMEZONE = "Asia/Seoul"
# When True, rows whose actor_is_bot flag is set are dropped in prepare_df.
IGNORE_BOTS = False

SAMPLE_SIZE   = 200_000   # max rows sampled for fitting (fit_on_sample)
CHUNK_SIZE    = 200_000   # rows scored per chunk (score_in_chunks)
CONTAMINATION = 0.02      # passed to IsolationForest(contamination=...)
QUANTILE      = 0.98      # quantile of training-sample scores used as the threshold
TOPK          = 300       # max rows kept when SAVE_ALL is False
MAX_PER_ACTOR = 2         # per-actor cap applied together with TOPK
SAVE_ALL      = False     # True: keep every row at/above the threshold (no Top-K, no cap)

# Output path (None -> an automatic name is built in the run section)
OUT_CSV = None  # e.g. r"C:\Users\EL59\Documents\final_project\test\anomalies_2025-06-01.csv"

# Logging settings
LOG_LEVEL = logging.INFO          # DEBUG / INFO
LOG_EVERY_N_FILES = 100           # emit one progress log every N files while loading

# ----------------------------
# LOGGER
# ----------------------------
# Configure the root logger once for the cell; messages look like
# "HH:MM:SS | LEVEL   | text".
logging.basicConfig(
    level=LOG_LEVEL,
    format="%(asctime)s | %(levelname)-7s | %(message)s",
    datefmt="%H:%M:%S"
)
# Named logger shared by every helper below.
logger = logging.getLogger("gh-anom")

@contextmanager
def step_timer(msg: str):
    """Log the start, outcome and wall-clock duration of a pipeline step.

    Usage: ``with step_timer("label"): ...``.  A start line is logged on
    entry.  On normal exit (including ``return`` from inside the ``with``
    body) a completion line is logged at INFO.  If the body raises, a failure
    line is logged at ERROR and the exception is re-raised unchanged, so a
    failed step is never reported as completed.

    Args:
        msg: Human-readable label of the step, embedded in every log line.
    """
    t0 = time.perf_counter()
    logger.info(f"▶ {msg} ...")
    try:
        yield
    except BaseException:
        # BaseException so that KeyboardInterrupt in a long step is also
        # reported; the exception itself is always propagated.
        dt = time.perf_counter() - t0
        logger.error(f"❌ {msg} 실패 ({dt:,.2f}s)")
        raise
    else:
        dt = time.perf_counter() - t0
        logger.info(f"✅ {msg} 완료 ({dt:,.2f}s)")

def log_df(title: str, df: pd.DataFrame, extra: str = ""):
    """Log a one-line summary (title and shape) of *df*.

    Args:
        title: Label printed before the shape.
        df: Frame whose ``shape`` is reported.
        extra: Optional free text appended after a `` | `` separator.
    """
    suffix = ""
    if extra:
        suffix = " | " + extra
    logger.info(f"{title}: shape={df.shape}{suffix}")

# ----------------------------
# CONSTANTS / GLOBALS
# ----------------------------
# File suffixes that load_events treats as event files.
VALID_EXTS = (".json", ".jsonl", ".ndjson", ".gz")

# Maps an event's author_association value to the numeric trust_score used as
# a feature.  flatten_event looks values up with a default of 1, so both a
# missing association (None) and any association not listed here score 1.
TRUST_MAP = {
    "OWNER": 3, "MEMBER": 3,
    "COLLABORATOR": 2,
    "CONTRIBUTOR": 1,
    "FIRST_TIMER": 0, "FIRST_TIME_CONTRIBUTOR": 0,
    "NONE": 0, None: 1
}
# Event types flagged via the is_sensitive_type feature.
SENSITIVE_TYPES = {"PushEvent","DeleteEvent","CreateEvent","PullRequestEvent","PullRequestReviewEvent"}

# Per-hour event counts and their 20th-percentile value.  Both stay None until
# the run section fills them from the TEST set; reason_generator reads them.
HOUR_COUNTS: Optional[Dict[int,int]] = None
HOUR_Q20_THRESHOLD: Optional[int] = None

# ----------------------------
# LOADING UTILS
# ----------------------------
def _first_non_ws_char(f) -> Optional[str]:
    """Peek at the first non-whitespace character of an open text stream.

    Leading whitespace is consumed.  On success the stream is repositioned so
    that the returned character is the next one to be read.  If the stream
    holds only whitespace (or nothing), it is rewound to offset 0 and None is
    returned.
    """
    while True:
        mark = f.tell()
        char = f.read(1)
        if not char:
            # Hit EOF without finding content: rewind fully.
            f.seek(0)
            return None
        if not char.isspace():
            # Un-read the character so the caller's parser sees it.
            f.seek(mark)
            return char

def _iter_json_objects(path: Path):
    """Yield every JSON object (dict) stored in *path*.

    Two layouts are handled:

    * a single JSON array -- chosen when the first non-whitespace character
      is ``[`` and the file is either uncompressed or a ``*.json.gz``;
    * line-delimited JSON -- everything else; blank lines and lines that fail
      to parse are skipped.

    Non-dict values are ignored in both layouts.  Files are decoded as UTF-8
    with undecodable bytes dropped (``errors="ignore"``).
    """
    gzipped = path.suffix.lower() == ".gz"
    # For "name.json.gz" this is ".json"; None for uncompressed files.
    inner_ext = Path(path.stem).suffix.lower() if gzipped else None

    if gzipped:
        handle = gzip.open(path, "rt", encoding="utf-8", errors="ignore")
    else:
        handle = open(path, "r", encoding="utf-8", errors="ignore")

    with handle as f:
        first = _first_non_ws_char(f)
        is_array_file = first == "[" and (not gzipped or inner_ext == ".json")

        if is_array_file:
            data = json.load(f)
            if isinstance(data, list):
                yield from (item for item in data if isinstance(item, dict))
            return

        for raw_line in f:
            text = raw_line.strip()
            if not text:
                continue
            try:
                parsed = json.loads(text)
            except Exception:
                # Best effort: a malformed line must not abort the whole file.
                continue
            if isinstance(parsed, dict):
                yield parsed

def load_events(dir_path: Path) -> pd.DataFrame:
    """Load and flatten every event file under *dir_path* (recursively).

    Files are selected by suffix (``VALID_EXTS``) and processed in sorted
    path order, so the row kept by the ``id`` de-duplication is the same on
    every run.  Loading is best effort: a file that raises while being parsed
    is skipped, but the skip is reported at WARNING level and counted, so
    data loss is visible at the default INFO log level.  Rows yielded before
    the failure point of such a file are kept.

    Args:
        dir_path: Root directory to scan.

    Returns:
        One row per flattened event, de-duplicated on ``id`` when that column
        exists; an empty DataFrame when nothing could be loaded.
    """
    with step_timer(f"이벤트 로딩 - {dir_path}"):
        rows = []
        files = sorted(
            p for p in dir_path.rglob("*")
            if p.is_file() and p.suffix.lower() in VALID_EXTS
        )
        logger.info(f"파일 수: {len(files)}개 (유효 확장자만)")

        failed = 0
        for i, p in enumerate(tqdm(files, desc="Loading files"), start=1):
            try:
                for ev in _iter_json_objects(p):
                    flat = flatten_event(ev)
                    if flat:
                        rows.append(flat)
            except Exception as e:
                failed += 1
                logger.warning(f"  파일 스킵(파싱 실패): {p.name} | {e}")
            if i % LOG_EVERY_N_FILES == 0:
                logger.info(f"  진행 상황: {i}/{len(files)} 파일 처리, 누적 행 {len(rows):,}")

        if failed:
            logger.warning(f"파싱 실패로 스킵된 파일: {failed}/{len(files)}개")
        if not rows:
            logger.warning("로딩된 행이 없습니다.")
            return pd.DataFrame()

        df = pd.DataFrame(rows)
        before = len(df)
        if "id" in df.columns:
            df = df.drop_duplicates(subset=["id"], keep="first")
        logger.info(f"중복 제거: {before:,} -> {len(df):,}")
        return df

# ----------------------------
# FLATTEN & HELPERS
# ----------------------------
def safe_get(d: dict, path: List[str], default=None):
    """Walk nested dicts along *path* and return the value found there.

    Returns *default* as soon as a step is not a dict or lacks the next key.
    An empty *path* returns *d* itself.
    """
    node = d
    for key in path:
        if not isinstance(node, dict) or key not in node:
            return default
        node = node[key]
    return node

def parse_ts(ts) -> Optional[str]:
    """Normalise a raw timestamp to a string, or None when unusable.

    * ``None`` -> ``None``.
    * ``int`` / ``float`` -> treated as Unix epoch seconds (fraction
      truncated) and rendered as ``YYYY-MM-DDTHH:MM:SSZ`` in UTC.  Values
      that cannot be converted (NaN, infinity, out of platform range) give
      ``None``.
    * ``str`` -> returned unchanged; it is not validated here.
    * anything else -> ``None``.
    """
    if ts is None:
        return None
    if isinstance(ts, (int, float)):
        # Local import: the file's top-level import only brings in `datetime`.
        from datetime import timezone
        try:
            # Aware conversion replaces datetime.utcfromtimestamp(), which is
            # deprecated since Python 3.12.
            moment = datetime.fromtimestamp(int(ts), tz=timezone.utc)
        except (ValueError, OverflowError, OSError):
            return None
        # Drop tzinfo so isoformat() has no "+00:00" and the original
        # "...Z" output format is kept byte-for-byte.
        return moment.replace(tzinfo=None).isoformat() + "Z"
    if isinstance(ts, str):
        return ts
    return None

def is_bot(login: Optional[str]) -> bool:
    """Heuristically decide whether *login* names a bot account.

    A non-empty string is a bot when its lower-cased form ends with
    ``[bot]``, ``-bot``, ``_bot`` or plain ``bot``.  Anything else (None,
    empty, non-string) is not a bot.
    """
    # NOTE(review): the bare "bot" suffix already covers "-bot"/"_bot" and
    # also matches ordinary names ending in "bot"; confirm this is intended.
    if login and isinstance(login, str):
        return login.lower().endswith(("[bot]", "-bot", "_bot", "bot"))
    return False

def flatten_event(ev: dict) -> Optional[dict]:
    """Flatten one raw event dict into the flat record used as a table row.

    Reads actor/repo/org identifiers, the author association (from the pull
    request, else from the comment), push sizes, the ref, and the lengths of
    the first available title/body among issue, pull request and release.

    Returns:
        The flat record, or None if anything goes wrong while reading *ev*
        (for example when *ev* is not a dict).
    """
    try:
        def payload(*keys):
            # Lookup relative to ev["payload"]; None when any step is missing.
            return safe_get(ev, ["payload", *keys])

        def as_int(value):
            # Non-numeric or missing sizes count as 0.
            return int(value) if isinstance(value, (int, float)) else 0

        event_type = ev.get("type")
        timestamp = parse_ts(ev.get("created_at") or payload("created_at"))
        login = safe_get(ev, ["actor", "login"])

        association = payload("pull_request", "author_association")
        if association is None:
            association = payload("comment", "author_association")

        ref = payload("ref")
        title = (payload("issue", "title")
                 or payload("pull_request", "title")
                 or payload("release", "name"))
        body = (payload("issue", "body")
                or payload("pull_request", "body")
                or payload("release", "body"))

        record = {
            "id": ev.get("id"),
            "type": event_type,
            "created_at": timestamp,
            "hour": None,  # computed after the timezone conversion
            "actor_login": login,
            "actor_id": safe_get(ev, ["actor", "id"]),
            "actor_is_bot": is_bot(login),
            "repo_name": safe_get(ev, ["repo", "name"]),
            "repo_id": safe_get(ev, ["repo", "id"]),
            "org_login": safe_get(ev, ["org", "login"]),
            "org_id": safe_get(ev, ["org", "id"]),
            "author_association": association,
            "trust_score": TRUST_MAP.get(association, 1),
            "push_size": as_int(payload("size")),
            "push_distinct": as_int(payload("distinct_size")),
            "ref": ref if isinstance(ref, str) else None,
            "title_len": len(title) if isinstance(title, str) else 0,
            "body_len": len(body) if isinstance(body, str) else 0,
            "is_sensitive_type": event_type in SENSITIVE_TYPES if isinstance(event_type, str) else False,
        }
        return record
    except Exception:
        return None

# ----------------------------
# FEATURE ENGINEERING (NaN-safe)
# ----------------------------
def add_behavioral_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* extended with behavioural features.

    Added columns (in this order): ``actor_events_total``,
    ``repo_events_total``, ``org_events_total``, ``actor_repo_events``,
    ``actor_org_events``, ``hour_events_total``, ``ref_is_mainline``,
    ``low_trust``, ``repo_push_q90``, ``actor_hour_events`` and
    ``actor_hour_ratio``.  Missing base numeric/boolean columns are created
    with 0 / False, and NaNs in them are filled the same way.

    All counts are computed within *df* only, so they depend on which rows
    the caller passed in.  Requires the columns ``id``, ``actor_login``,
    ``repo_name``, ``org_login`` and ``hour``.
    """
    with step_timer("피처 엔지니어링"):
        df = df.copy()

        # Backfill base fields so later steps can rely on them.
        numeric_base = ["trust_score","push_size","push_distinct","title_len","body_len"]
        boolean_base = ["actor_is_bot","is_sensitive_type"]
        for col in numeric_base:
            if col not in df.columns: df[col] = 0
        for col in boolean_base:
            if col not in df.columns: df[col] = False
        df[numeric_base] = df[numeric_base].fillna(0)
        df[boolean_base] = df[boolean_base].fillna(False)

        # Group size per row; dropna=False keeps rows whose key is NaN in
        # their own group instead of producing NaN counts.
        def _cnt(keys):
            return df.groupby(keys, dropna=False)["id"].transform("size").fillna(0).astype("int64")

        df["actor_events_total"] = _cnt(["actor_login"])
        df["repo_events_total"]  = _cnt(["repo_name"])
        df["org_events_total"]   = _cnt(["org_login"])

        df["actor_repo_events"]  = _cnt(["actor_login","repo_name"])
        df["actor_org_events"]   = _cnt(["actor_login","org_login"])
        df["hour_events_total"]  = _cnt(["hour"])

        # Mainline ref / low trust flags.
        ref_series = df.get("ref", pd.Series(index=df.index, dtype="object")).fillna("").astype(str).str.lower()
        # Non-capturing group: a capturing group makes str.contains emit a
        # "has match groups" UserWarning.  The match itself is unchanged.
        # NOTE(review): this is an unanchored match, so refs such as
        # "refs/heads/main-foo" also count as mainline -- confirm intent.
        df["ref_is_mainline"] = ref_series.str.contains(r"refs/heads/(?:main|master|prod|production)", regex=True)
        df["low_trust"] = (df["trust_score"] <= 1)

        # Per-repository 90th percentile of push_size (0.0 when the
        # repository has no positive push at all).
        def _q90(g):
            if (g > 0).any(): return float(np.quantile(g.values, 0.90))
            return 0.0
        df["repo_push_q90"] = df.groupby("repo_name", dropna=False)["push_size"].transform(_q90).fillna(0.0)

        # Share of each actor's events that fall in this row's hour.  The
        # denominator is the per-actor count computed above; a group size is
        # always >= 1, so no zero-division guard is needed.
        actor_hour = _cnt(["actor_login","hour"])
        df["actor_hour_events"] = actor_hour
        df["actor_hour_ratio"]  = (actor_hour / df["actor_events_total"]).astype(float).fillna(0.0)

        return df

# ----------------------------
# PREPROCESS & TRAIN
# ----------------------------
def build_preprocessor() -> Tuple[ColumnTransformer, List[str]]:
    """Create the (unfitted) feature preprocessor and its input column list.

    Numeric columns go through ``RobustScaler``, boolean columns are passed
    through untouched, and ``type`` is one-hot encoded with unknown
    categories ignored.  Columns not listed are dropped.

    Returns:
        ``(preprocessor, feature_columns)`` where *feature_columns* is the
        numeric, boolean and categorical names concatenated in that order.
    """
    num_cols = [
        "hour", "trust_score", "push_size", "push_distinct", "title_len", "body_len",
        "actor_events_total", "repo_events_total", "org_events_total",
        "actor_repo_events", "actor_org_events", "hour_events_total",
        "repo_push_q90", "actor_hour_ratio",
    ]
    bool_cols = ["actor_is_bot", "is_sensitive_type", "ref_is_mainline", "low_trust"]
    cat_cols = ["type"]

    # Try the `sparse_output` keyword first and fall back to `sparse` when
    # the installed OneHotEncoder rejects it with a TypeError.
    try:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

    steps = [
        ("num", RobustScaler(), num_cols),
        ("bool", "passthrough", bool_cols),
        ("cat", encoder, cat_cols),
    ]
    preprocessor = ColumnTransformer(transformers=steps, remainder="drop")

    all_cols = [*num_cols, *bool_cols, *cat_cols]
    logger.info(f"전처리 피처: num={len(num_cols)}, bool={len(bool_cols)}, cat={len(cat_cols)} -> total={len(all_cols)}")
    return preprocessor, all_cols

def fit_on_sample(df: pd.DataFrame, sample_size=200_000, contamination=0.02, quantile=0.98, random_state=42):
    """Fit the preprocessor and an IsolationForest on a sample of *df*.

    At most *sample_size* rows are drawn (the whole frame when it is not
    larger).  The anomaly score is the negated ``score_samples`` output, so
    larger means more anomalous; the threshold is the *quantile* of those
    scores on the sample.

    Returns:
        ``(preprocessor, model, threshold, feature_columns)``.
    """
    with step_timer(f"샘플 학습 (sample_size={sample_size:,}, contamination={contamination}, quantile={quantile})"):
        total_rows = len(df)
        take = min(sample_size, total_rows)
        if take < total_rows:
            sample = df.sample(n=take, random_state=random_state)
        else:
            sample = df
        log_df("샘플 DF", sample, extra=f"m={take:,}/{total_rows:,}")

        pre, feat_cols = build_preprocessor()

        with step_timer("전처리기 피팅 & 샘플 변환"):
            features = pre.fit_transform(sample[feat_cols])

        with step_timer("IsolationForest 학습"):
            model = IsolationForest(
                n_estimators=300,
                contamination=contamination,
                max_samples='auto',
                random_state=random_state,
                n_jobs=-1,
            )
            model.fit(features)

        with step_timer("임계값(quantile) 계산"):
            # Negate so that a higher value means "more anomalous".
            scores = -model.score_samples(features)
            thresh = float(np.quantile(scores, quantile))
            logger.info(f"임계값(threshold)={thresh:.6f} (샘플 상위 {(quantile*100):.0f} 분위)")

        return pre, model, thresh, feat_cols

# ----------------------------
# SCORING
# ----------------------------
def _apply_cap_and_topk(df_scores: pd.DataFrame, max_per_actor: int, topk: int) -> pd.DataFrame:
    """Keep the highest-scoring rows, at most *max_per_actor* per actor.

    Rows are ranked by ``_anomaly_score`` (descending); each actor
    (``actor_login``) contributes at most *max_per_actor* rows and at most
    *topk* rows are returned overall.  Rows with a missing actor are treated
    as one shared actor, as is the whole frame when it has no
    ``actor_login`` column.

    The selection is vectorised instead of rebuilding the frame from
    ``iterrows()`` rows, so column dtypes and index labels of the input are
    preserved and ``topk=0`` correctly yields no rows.

    Returns:
        The selected rows in descending score order.  If the cap removes
        every row (``max_per_actor <= 0``), the plain top-*topk* rows are
        returned instead.  An empty input is returned as is.
    """
    if df_scores.empty:
        return df_scores

    limit = max(int(topk), 0)
    # Stable sort keeps the input order among equal scores.
    ranked = df_scores.sort_values("_anomaly_score", ascending=False, kind="stable")

    if "actor_login" in ranked.columns:
        # factorize maps missing actors (None/NaN) to the single code -1, so
        # they are capped as one group.
        actor_codes, _ = pd.factorize(ranked["actor_login"])
        rank_in_actor = pd.Series(actor_codes).groupby(actor_codes).cumcount().to_numpy()
    else:
        rank_in_actor = np.arange(len(ranked))

    capped = ranked[rank_in_actor < max_per_actor].head(limit)
    if capped.empty:
        return ranked.head(limit)
    return capped

def score_in_chunks(df: pd.DataFrame, pre: ColumnTransformer, model: IsolationForest,
                    feat_cols: List[str], thresh: float, chunk_size: int,
                    topk: int, save_all: bool, max_per_actor: int) -> pd.DataFrame:
    """Score *df* chunk by chunk and return the retained anomalies.

    Each chunk is transformed with *pre* and scored as
    ``-model.score_samples`` (higher = more anomalous).

    * ``save_all=True``: every row scoring at or above *thresh* is kept; no
      Top-K or per-actor cap is applied.
    * ``save_all=False``: rows at or above *thresh* are merged into a running
      buffer that is trimmed after every chunk to *topk* rows with at most
      *max_per_actor* rows per actor.  A chunk with no row at or above
      *thresh* contributes its own top 100 rows instead.

    Args:
        df: Feature-engineered rows to score.
        pre: Fitted preprocessor.
        model: Fitted IsolationForest.
        feat_cols: Columns of *df* fed to *pre*.
        thresh: Minimum anomaly score for a row to be kept.
        chunk_size: Rows per chunk; must be positive.
        topk: Buffer size limit when ``save_all`` is False.
        save_all: See above.
        max_per_actor: Per-actor cap when ``save_all`` is False.

    Returns:
        The retained rows plus an ``_anomaly_score`` column, sorted by score
        (descending).  Empty, with the expected columns, when nothing is kept.

    Raises:
        ValueError: If *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        # A negative step would make range() empty and silently score nothing.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    n = len(df)
    logger.info(f"스코어링 시작: rows={n:,}, chunk_size={chunk_size:,}, save_all={save_all}, topk={topk}, max_per_actor={max_per_actor}")
    t0 = time.perf_counter()
    result_columns = list(df.columns) + ["_anomaly_score"]

    if save_all:
        parts = []
        for idx, st in enumerate(range(0, n, chunk_size), start=1):
            ed = min(st + chunk_size, n)
            with step_timer(f"청크 {idx}: 행 {st:,}~{ed-1:,} 변환/스코어링"):
                ch = df.iloc[st:ed].copy()
                Xt = pre.transform(ch[feat_cols])
                sc = -model.score_samples(Xt)
                mask = sc >= thresh
                kept = int(mask.sum())
                logger.info(f"  임계 이상 유지: {kept:,} / {len(ch):,}")
                if kept > 0:
                    part = ch.loc[mask].copy()
                    part["_anomaly_score"] = sc[mask]
                    parts.append(part)
        if not parts:
            logger.warning("임계 이상 레코드가 없습니다.")
            return pd.DataFrame(columns=result_columns)
        out = pd.concat(parts, ignore_index=True).sort_values("_anomaly_score", ascending=False)
        logger.info(f"스코어링 완료: 최종 유지 {len(out):,}건, 총 {time.perf_counter()-t0:,.2f}s")
        return out

    # Start without a buffer rather than with an empty all-object frame:
    # concatenating real rows onto an empty frame relies on deprecated
    # pandas behaviour for deciding the result dtypes.
    buffer: Optional[pd.DataFrame] = None
    for idx, st in enumerate(range(0, n, chunk_size), start=1):
        ed = min(st + chunk_size, n)
        with step_timer(f"청크 {idx}: 행 {st:,}~{ed-1:,} 변환/스코어링/선별"):
            ch = df.iloc[st:ed].copy()
            Xt = pre.transform(ch[feat_cols])
            sc = -model.score_samples(Xt)
            ch["_anomaly_score"] = sc
            keep = ch[ch["_anomaly_score"] >= thresh].copy()
            if keep.empty:
                # NOTE(review): these rows score below the threshold and stay
                # in the final result unless higher-scoring rows push them out
                # of the Top-K.
                keep = ch.nlargest(min(100, len(ch)), "_anomaly_score").copy()
                logger.info(f"  임계 이상 없음 → 청크 Top-{len(keep)} 임시 유지")
            else:
                logger.info(f"  임계 이상 유지: {len(keep):,}")
            if buffer is None:
                buffer = keep.reset_index(drop=True)
            else:
                buffer = pd.concat([buffer, keep], ignore_index=True)
            before = len(buffer)
            buffer = _apply_cap_and_topk(buffer, max_per_actor=max_per_actor, topk=topk)
            after = len(buffer)
            logger.info(f"  버퍼 정리(per-actor cap & Top-K): {before:,} → {after:,}")

    if buffer is None:
        # No chunk was processed (empty input).
        buffer = pd.DataFrame(columns=result_columns)
    buffer = buffer.sort_values("_anomaly_score", ascending=False).reset_index(drop=True)
    logger.info(f"스코어링 완료: 최종 유지 {len(buffer):,}건, 총 {time.perf_counter()-t0:,.2f}s")
    return buffer

# ----------------------------
# EXPLANATIONS
# ----------------------------
def reason_generator(row: pd.Series) -> List[str]:
    """Return human-readable reasons explaining why *row* looks anomalous.

    Each rule is evaluated independently from the row's feature values.
    Fields that are missing, None, NaN or otherwise unparsable fall back to a
    neutral default (0 / False; -1 for ``hour``), so one bad field no longer
    silently suppresses every rule that comes after it.  The hour-based rule
    is only evaluated once the module-level ``HOUR_COUNTS`` and
    ``HOUR_Q20_THRESHOLD`` have been set.

    Args:
        row: One scored record; anything with a dict-style ``get`` works.

    Returns:
        The matching reasons in rule order, or a single generic reason when
        no rule matches.
    """
    def _num(key, default=0.0):
        # Numeric field as a finite float, else the default.
        try:
            value = float(row.get(key, default))
        except (TypeError, ValueError):
            return float(default)
        return value if np.isfinite(value) else float(default)

    def _flag(key):
        # Boolean field; None/NaN and values without a truth value are False.
        value = row.get(key, False)
        if value is None or (isinstance(value, float) and value != value):
            return False
        try:
            return bool(value)
        except (TypeError, ValueError):
            return False

    sensitive = _flag("is_sensitive_type")
    low_trust = _flag("low_trust")
    # int() truncation mirrors how these integer features were compared before.
    actor_repo_events = int(_num("actor_repo_events"))
    push_size = int(_num("push_size"))

    reasons = []
    if low_trust and sensitive:
        reasons.append("신뢰 낮음 + 민감 이벤트")
    if _flag("actor_is_bot") and sensitive:
        reasons.append("봇 계정 + 민감 이벤트")
    if actor_repo_events <= 1 and sensitive:
        reasons.append("배우-저장소 상호작용 희소 + 민감 이벤트")
    if _flag("ref_is_mainline") and low_trust:
        reasons.append("메인라인에서 신뢰 낮음")
    if push_size > 0 and push_size >= _num("repo_push_q90"):
        reasons.append("푸시 규모가 저장소 q90 이상")
    if _num("actor_hour_ratio") <= 0.05 and sensitive:
        reasons.append("배우에게 드문 시간대(≤5%) + 민감 이벤트")
    if int(_num("trust_score")) >= 2 and actor_repo_events <= 1:
        reasons.append("신뢰는 높지만 이 저장소 활동은 이례적")

    # Globally quiet hour: this hour's event count is at or below the
    # stored 20th-percentile threshold.
    if HOUR_COUNTS is not None and HOUR_Q20_THRESHOLD is not None:
        hour = int(_num("hour", -1))
        if 0 <= hour <= 23 and HOUR_COUNTS.get(hour, 0) <= HOUR_Q20_THRESHOLD and sensitive:
            reasons.append("전체적으로 한산한 시간대(하위 분위) + 민감 이벤트")

    if not reasons:
        reasons.append("다변량 특징 조합에서 드문 패턴")
    return reasons

# ----------------------------
# TZ & FILTERING
# ----------------------------
def ensure_tz_and_hour(df: pd.DataFrame, tz: str = "Asia/Seoul") -> pd.DataFrame:
    """Parse ``created_at`` and derive the local ``hour`` and ``date`` columns.

    ``created_at`` is parsed as UTC (unparsable values become NaT) and, when
    *tz* is given, converted to that timezone; if the conversion fails a
    warning is logged and UTC is kept.  A frame without ``created_at`` is
    returned unchanged.

    Returns:
        A copy of *df* with ``created_at`` replaced by the parsed timestamps
        and ``hour`` / ``date`` set from them.
    """
    with step_timer(f"타임존 변환 & hour/date 계산 (tz={tz})"):
        if "created_at" not in df.columns:
            return df

        stamps = pd.to_datetime(df["created_at"], errors="coerce", utc=True)
        if tz:
            try:
                stamps = stamps.dt.tz_convert(tz)
            except Exception as e:
                logger.warning(f"타임존 변환 실패(UTC 유지): {e}")

        result = df.assign(
            created_at=stamps,
            hour=stamps.dt.hour,
            date=stamps.dt.date,
        )

        missing = int(stamps.isna().sum())
        if missing:
            logger.info(f"created_at NaT 레코드 {missing:,}건 존재")
        return result

def filter_by_date_hour(df: pd.DataFrame, start_date: str, end_date: Optional[str],
                        start_hour: Optional[int], end_hour: Optional[int]) -> pd.DataFrame:
    """Keep rows whose ``date`` and ``hour`` fall inside the given window.

    Both date bounds are inclusive; *end_date* defaults to *start_date*.
    Hour bounds are inclusive and clamped to 0..23, so an *end_hour* of 24
    means "through hour 23".  When both hour bounds are None no hour filter
    is applied.

    Returns:
        A copy of the matching rows.
    """
    with step_timer(f"날짜/시간 필터 (date {start_date}~{end_date or start_date}, hour {start_hour}~{end_hour})"):
        def _clamp_hour(value) -> int:
            # Force an hour into the valid 0..23 range.
            return min(23, max(0, int(value)))

        if end_date is None:
            end_date = start_date
        first_day = pd.to_datetime(start_date).date()
        last_day = pd.to_datetime(end_date).date()
        selected = (df["date"] >= first_day) & (df["date"] <= last_day)

        if not (start_hour is None and end_hour is None):
            low = 0 if start_hour is None else _clamp_hour(start_hour)
            high = 23 if end_hour is None else _clamp_hour(end_hour)
            selected &= (df["hour"] >= low) & (df["hour"] <= high)

        total = len(df)
        out = df.loc[selected].copy()
        logger.info(f"필터링: {total:,} → {len(out):,}")
        return out

def prepare_df(data_dir: str, start_date: str, end_date: Optional[str],
               start_hour: Optional[int], end_hour: Optional[int],
               tz: str, ignore_bots: bool) -> pd.DataFrame:
    """Load, clean, filter and feature-engineer the events under *data_dir*.

    Steps: load events, drop rows without ``created_at``, convert to *tz* and
    derive hour/date, filter by the date/hour window, optionally drop bot
    actors, then add the behavioural features.

    Raises:
        ValueError: If no events were loaded, or none remain after filtering.
    """
    events = load_events(Path(data_dir))
    if events.empty:
        raise ValueError(f"No events loaded from: {data_dir}")
    log_df("로드 직후", events)

    with step_timer("created_at 유효 레코드만 유지"):
        n_before = len(events)
        events = events.dropna(subset=["created_at"])
        logger.info(f"dropna(created_at): {n_before:,} → {len(events):,}")

    events = ensure_tz_and_hour(events, tz=tz)
    events = filter_by_date_hour(events, start_date, end_date, start_hour, end_hour)

    if ignore_bots and "actor_is_bot" in events.columns:
        with step_timer("봇 계정 제외"):
            n_before = len(events)
            not_bot = events["actor_is_bot"] != True
            events = events[not_bot].copy()
            logger.info(f"봇 제외: {n_before:,} → {len(events):,}")

    if events.empty:
        raise ValueError("No rows left after filtering (check tz/hours).")

    events = add_behavioral_stats(events)
    log_df("피처 엔지니어링 완료", events)
    return events

# ----------------------------
# RUN (TRAIN → TEST)
# ----------------------------
# End-to-end run: build the TRAIN set, fit the model, build the TEST set,
# score it, attach reasons, then save and preview the result.  Any exception
# is logged and its traceback printed instead of propagating out of the cell.
try:
    with step_timer("TRAIN 데이터 준비"):
        train_df = prepare_df(
            data_dir=TRAIN_DIR,
            start_date=TRAIN_START_DATE,
            end_date=TRAIN_END_DATE,
            start_hour=TRAIN_START_HOUR,
            end_hour=TRAIN_END_HOUR,
            tz=TIMEZONE,
            ignore_bots=IGNORE_BOTS,
        )
        log_df("TRAIN 세트", train_df)

    # Fit preprocessor + IsolationForest and derive the score threshold.
    pre, model, thresh, feat_cols = fit_on_sample(
        train_df,
        sample_size=SAMPLE_SIZE,
        contamination=CONTAMINATION,
        quantile=QUANTILE,
        random_state=42
    )

    with step_timer("TEST 데이터 준비"):
        test_df = prepare_df(
            data_dir=TEST_DIR,
            start_date=TEST_START_DATE,
            end_date=TEST_END_DATE,
            start_hour=TEST_START_HOUR,
            end_hour=TEST_END_HOUR,
            tz=TIMEZONE,
            ignore_bots=IGNORE_BOTS,
        )
        log_df("TEST 세트", test_df)

    # Hour distribution used by reason_generator (computed on the TEST set).
    # Rebinds the module-level HOUR_COUNTS / HOUR_Q20_THRESHOLD.
    with step_timer("시간대 분포(Reason용) 계산"):
        hour_counts_series = test_df.groupby("hour")["id"].size()
        HOUR_COUNTS = {int(h): int(c) for h, c in hour_counts_series.to_dict().items()}
        HOUR_Q20_THRESHOLD = int(np.quantile(hour_counts_series.values, 0.20)) if len(hour_counts_series)>0 else 0
        logger.info(f"HOUR_COUNTS keys={sorted(HOUR_COUNTS.keys())}, Q20={HOUR_Q20_THRESHOLD}")

    with step_timer("TEST 스코어링(청크)"):
        anomalies = score_in_chunks(
            df=test_df,
            pre=pre,
            model=model,
            feat_cols=feat_cols,
            thresh=thresh,
            chunk_size=CHUNK_SIZE,
            topk=TOPK,
            save_all=SAVE_ALL,
            max_per_actor=MAX_PER_ACTOR,
        )

    if anomalies.empty:
        logger.warning("이상치가 없습니다.")
        # Keep the expected columns so the selection below still works.
        anomalies = pd.DataFrame(columns=list(test_df.columns)+["_anomaly_score","reasons"])
    else:
        with step_timer("이상 사유(reasons) 생성"):
            anomalies = anomalies.copy()
            # One "; "-joined reason string per anomaly row.
            anomalies["reasons"] = anomalies.apply(lambda r: "; ".join(reason_generator(r)), axis=1)

    # Columns (and their order) written to the output CSV.
    keep_cols = [
        "id","type","created_at","hour","actor_login","repo_name","org_login",
        "trust_score","push_size","push_distinct",
        "is_sensitive_type","ref_is_mainline","low_trust",
        "actor_events_total","repo_events_total","org_events_total",
        "actor_repo_events","actor_org_events","hour_events_total",
        "repo_push_q90","actor_hour_ratio",
        "_anomaly_score","reasons"
    ]
    # Create any missing output column as all-None.
    for c in keep_cols:
        if c not in anomalies.columns: anomalies[c] = None
    anomalies = anomalies[keep_cols].sort_values("_anomaly_score", ascending=False).reset_index(drop=True)
    log_df("최종 이상치 결과", anomalies)

    # Save
    with step_timer("CSV 저장"):
        if OUT_CSV is None:
            # Automatic name inside TEST_DIR, derived from the test date range.
            test_range = f"{TEST_START_DATE}_to_{TEST_END_DATE or TEST_START_DATE}"
            OUT_CSV = str(Path(TEST_DIR) / f"anomalies_{test_range}.csv")
        os.makedirs(Path(OUT_CSV).parent, exist_ok=True)
        anomalies.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
        logger.info(f"Saved → {OUT_CSV}")

    # Preview
    display(anomalies.head(20))

# NOTE(review): `e` is unused; the traceback is printed via traceback.print_exc().
except Exception as e:
    logger.error("실행 중 예외 발생!")
    traceback.print_exc()


01:24:57 | INFO    | ▶ TRAIN 데이터 준비 ...
01:24:57 | INFO    | ▶ 이벤트 로딩 - C:\Users\EL59\Documents\final_project\dataset ...
01:24:57 | INFO    | 파일 수: 112개 (유효 확장자만)
Loading files:  88%|████████▊ | 99/112 [07:20<01:05,  5.02s/it]01:32:23 | INFO    |   진행 상황: 100/112 파일 처리, 누적 행 15,496,933
Loading files: 100%|██████████| 112/112 [08:26<00:00,  4.52s/it]
01:49:02 | INFO    | 중복 제거: 17,375,780 -> 17,375,771
01:49:02 | INFO    | ✅ 이벤트 로딩 - C:\Users\EL59\Documents\final_project\dataset 완료 (1,444.64s)
01:50:33 | INFO    | 로드 직후: shape=(17375771, 19)
01:50:33 | INFO    | ▶ created_at 유효 레코드만 유지 ...
01:53:04 | INFO    | dropna(created_at): 17,375,771 → 17,375,771
01:53:04 | INFO    | ✅ created_at 유효 레코드만 유지 완료 (150.64s)
01:53:04 | INFO    | ▶ 타임존 변환 & hour/date 계산 (tz=Asia/Seoul) ...
01:54:35 | INFO    | ✅ 타임존 변환 & hour/date 계산 (tz=Asia/Seoul) 완료 (91.81s)
01:54:46 | INFO    | ▶ 날짜/시간 필터 (date 2025-08-01~2025-08-07, hour 10~24) ...
01:55:02 | INFO    | 필터링: 17,375,771 → 5,580,515
01:55:03 | INFO 

Unnamed: 0,id,type,created_at,hour,actor_login,repo_name,org_login,trust_score,push_size,push_distinct,...,actor_events_total,repo_events_total,org_events_total,actor_repo_events,actor_org_events,hour_events_total,repo_push_q90,actor_hour_ratio,_anomaly_score,reasons
0,50372575946,IssueCommentEvent,2025-06-01 19:45:25+09:00,19,github-actions[bot],hairyhenderson/client_golang,,0,0,0,...,405425,31,1744244,15,351192,142826,3.0,0.05071,0.632733,다변량 특징 조합에서 드문 패턴
1,50372590820,IssueCommentEvent,2025-06-01 19:46:41+09:00,19,github-actions[bot],hairyhenderson/client_golang,,0,0,0,...,405425,31,1744244,15,351192,142826,3.0,0.05071,0.632733,다변량 특징 조합에서 드문 패턴
2,50375906709,PushEvent,2025-06-01 23:57:07+09:00,23,jqdwdty,jqdwdty/wtm_wf,,1,193,193,...,3366,3201,1744244,3201,3366,149665,191.0,0.068033,0.619592,신뢰 낮음 + 민감 이벤트; 메인라인에서 신뢰 낮음; 푸시 규모가 저장소 q90 이상
3,50375330253,PushEvent,2025-06-01 23:16:13+09:00,23,jqdwdty,jqdwdty/wtm_wf,,1,195,195,...,3366,3201,1744244,3201,3366,149665,191.0,0.068033,0.619592,신뢰 낮음 + 민감 이벤트; 메인라인에서 신뢰 낮음; 푸시 규모가 저장소 q90 이상
4,50365912800,PushEvent,2025-06-01 09:54:11+09:00,9,pull[bot],xfangfang/winget-pkgs,,1,9321,9321,...,17772,7,1744244,7,15958,126670,3744.0,0.071179,0.617297,신뢰 낮음 + 민감 이벤트; 봇 계정 + 민감 이벤트; 메인라인에서 신뢰 낮음; 푸...
5,50375400151,PushEvent,2025-06-01 23:21:13+09:00,23,pull[bot],kkpan11/nixpkgs,,1,742,742,...,17772,7,1744244,7,15958,149665,346.0,0.063696,0.617138,신뢰 낮음 + 민감 이벤트; 봇 계정 + 민감 이벤트; 메인라인에서 신뢰 낮음; 푸...
6,50368785171,PullRequestEvent,2025-06-01 14:26:13+09:00,14,dependabot[bot],offsoc/ragflow,,0,0,0,...,64093,6,1744244,4,38900,135126,369.5,0.062597,0.614725,신뢰 낮음 + 민감 이벤트; 봇 계정 + 민감 이벤트
7,50368784912,PullRequestEvent,2025-06-01 14:26:12+09:00,14,dependabot[bot],offsoc/ragflow,,0,0,0,...,64093,6,1744244,4,38900,135126,369.5,0.062597,0.614448,신뢰 낮음 + 민감 이벤트; 봇 계정 + 민감 이벤트
8,50375917199,IssueCommentEvent,2025-06-01 23:57:53+09:00,23,llvmbot,llvm/llvm-project,llvm,3,0,0,...,21,141,151,21,21,149665,0.0,0.333333,0.613851,다변량 특징 조합에서 드문 패턴
9,50365494821,PushEvent,2025-06-01 09:17:50+09:00,9,aqua-bot,aquasecurity/vuln-list,aquasecurity,1,2000,2000,...,9,3,40,3,9,126670,1600.2,0.333333,0.612361,신뢰 낮음 + 민감 이벤트; 봇 계정 + 민감 이벤트; 메인라인에서 신뢰 낮음; 푸...
