# AdaptableSnail

In [None]:
from __future__ import annotations
from typing import Dict, List, Tuple, Any, Optional
import warnings
from dataclasses import dataclass, field
import numpy as np
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from tqdm import tqdm

# Do not show pandas PerformanceWarning messages.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
# Make NumPy stay silent on invalid floating-point operations and divisions by zero.
np.seterr(invalid="ignore", divide="ignore")

# =============================================================================
# 1. CONFIGURATION
# =============================================================================
@dataclass
class FeatureConfig:
    """
    Holds the configuration parameters (hyperparameters).
    """
    fps: float = 30.0            # frame rate of the video
    pix_per_cm: float = 1.0      # pixels per centimetre (divisor of the pixel -> cm conversion)
    smooth_sigma: float = 1.0    # sigma of the Gaussian smoothing filter
    use_pairwise: bool = True    # whether a target context is built for pairwise features


# =============================================================================
# 2. AGENT CONTEXT
# =============================================================================
@dataclass
class AgentContext:
    """
    Container for the pre-processed data of a single mouse.
    Avoids recomputing velocity/acceleration several times.
    """
    idx: pd.Index          # frame index
    pos: np.ndarray        # [F, 2] cm
    vel: np.ndarray        # [F, 2] cm/s
    speed: np.ndarray      # [F, 1] cm/s
    acc: np.ndarray        # [F, 2] cm/s^2
    
    cx: pd.Series          # X coordinate as a Series (for rolling operations)
    cy: pd.Series          # Y coordinate as a Series
    speed_series: pd.Series # speed as a Series
    
    raw_df: Optional[pd.DataFrame] = None # raw per-body-part data


# =============================================================================
# 3. FEATURE EXTRACTOR
# =============================================================================
class FeatureExtractor:
    """
    Class trích xuất đặc trưng hành vi từ dữ liệu tracking.
    """
    def __init__(self, fps: float, pix_per_cm: float, smooth_sigma: float = 1.0, use_pairwise: bool = True):
        """
        Store the hyperparameters and register the feature functions to run.

        ``fps`` and ``pix_per_cm`` are coerced to float; ``smooth_sigma`` and
        ``use_pairwise`` are stored as given.
        """
        config_values = dict(
            fps=float(fps),
            pix_per_cm=float(pix_per_cm),
            smooth_sigma=smooth_sigma,
            use_pairwise=use_pairwise,
        )
        self.cfg = FeatureConfig(**config_values)

        # (registry key, feature function) pairs; the list order is the order
        # in which the functions are stored in the registry.
        registered = [
            ("kinematics", self._feat_basic_kinematics),
            ("multiscale", self._feat_multiscale),
            ("long_range", self._feat_long_range),
            ("cumulative", self._feat_cumulative),
            ("curvature", self._feat_curvature),
            ("speed_asym", self._feat_speed_asym),
            ("gauss_shift", self._feat_gauss_shift),
            ("grooming", self._feat_avoidance_trajectory),
            ("pose", self._feat_pose_shape),
            ("a", self._feat_submission_temporal),
            ("pairwise", self._feat_pairwise),
        ]
        self.feature_registry = dict(registered)

    # --- Helpers ---
    def _scale(self, n_frames_30fps: int) -> int:
        """Quy đổi số frame từ chuẩn 30fps sang fps thực tế của video."""
        return max(1, int(round(n_frames_30fps * self.cfg.fps / 30.0)))

    def _to_cm(self, arr):
        """Chuyển pixel -> cm."""
        return arr / self.cfg.pix_per_cm

    def _smooth(self, x):
        """Làm mượt dữ liệu bằng Gaussian filter."""
        if self.cfg.smooth_sigma is None or x.shape[0] < 3: return x
        if np.all(np.isnan(x)): return x
        return gaussian_filter1d(x, sigma=self.cfg.smooth_sigma, axis=0, mode="nearest")

    def _forward_fill_nan(self, pos):
        """
        Điền dữ liệu thiếu (NaN) bằng giá trị hợp lệ trước đó (Forward Fill).
        """
        if np.all(np.isnan(pos)):
            return np.zeros_like(pos)

        pos_ffill = pos.copy()
        mask = np.any(~np.isnan(pos_ffill), axis=1)
        if not mask.any():
            return np.zeros_like(pos_ffill)

        valid_idx = np.where(mask)[0]
        first, last = valid_idx[0], valid_idx[-1]
        pos_ffill[:first] = pos_ffill[first]
        pos_ffill[last + 1:] = pos_ffill[last]
        df_temp = pd.DataFrame(pos_ffill)
        df_temp = df_temp.ffill()
        return df_temp.to_numpy()
    
    def _speed_series(self, cx: pd.Series, cy: pd.Series) -> pd.Series:
        dx = cx.diff()
        dy = cy.diff()
        v = np.hypot(dx, dy).fillna(0.0) * self.cfg.fps
        return v.astype("float32")
    
    def _roll_future_mean(self, s: pd.Series, w: int, min_p: int = 1) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).mean().iloc[::-1]

    def _roll_future_var(self, s: pd.Series, w: int, min_p: int = 2) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).var().iloc[::-1]

    # --- Core Logic ---
    def _compute_kinematics(self, pos_px: np.ndarray):
        """
        Tính toán vật lý cơ bản: Pos(cm), Vel, Speed, Acc.
        Input: Array [Frames, 2] (pixel).
        Output: Tuple (pos_cm, vel, speed, acc).
        """
        pos_ffill = self._forward_fill_nan(pos_px)
        pos_cm = self._to_cm(pos_ffill.astype(np.float32))
        pos_cm = self._smooth(pos_cm)                                               # [F, 2]

        dt = 1.0 / self.cfg.fps
        vel = np.zeros_like(pos_cm, dtype=np.float32)
        vel[1:] = (pos_cm[1:] - pos_cm[:-1]) / dt                                   # [F, 2: (vx, vy)]
        speed = np.linalg.norm(vel, axis=1, keepdims=True).astype(np.float32)       # [F, 1]

        acc = np.zeros_like(pos_cm, dtype=np.float32)                          
        acc[1:] = (vel[1:] - vel[:-1]) / dt                                         # [F, 2:(ax, ay)]
        return pos_cm.astype(np.float32), vel, speed, acc

    def _build_context(self, frames, pos_px, mouse_df=None) -> AgentContext:
        """
        Build the AgentContext of one mouse from its pixel positions.

        The kinematics are computed once here; the x/y/speed columns are also
        exposed as Series indexed by frame. ``mouse_df`` is stored unchanged
        as ``raw_df``.
        """
        pos_cm, vel, speed, acc = self._compute_kinematics(pos_px)
        frame_index = pd.Index(frames, name="frame")

        series_fields = {
            "cx": pd.Series(pos_cm[:, 0], index=frame_index),
            "cy": pd.Series(pos_cm[:, 1], index=frame_index),
            "speed_series": pd.Series(speed[:, 0], index=frame_index),
        }
        return AgentContext(
            idx=frame_index,
            pos=pos_cm,
            vel=vel,
            speed=speed,
            acc=acc,
            raw_df=mouse_df,
            **series_fields,
        )

    # --- Feature Modules ---
    def _feat_basic_kinematics(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Lấy các giá trị thô: tọa độ x, y, vận tốc vx, vy, tốc độ, gia tốc ax, ay.
        """
        return {
            "a_x": ctx.pos[:, 0], "a_y": ctx.pos[:, 1],
            "a_vx": ctx.vel[:, 0], "a_vy": ctx.vel[:, 1],
            "a_speed": ctx.speed[:, 0],
            "a_ax": ctx.acc[:, 0], "a_ay": ctx.acc[:, 1]
        }

    def _feat_multiscale(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tính tốc độ trung bình (Mean) và độ lệch chuẩn (Std) ở đa mức thời gian.
        Feature 'sp_ratio' đo độ bùng nổ (Burstiness).
        """
        feats = {}
        speed = ctx.speed_series
        frame_scales = [10, 40, 160]
        for scale in frame_scales:
            ws = self._scale(scale)
            if len(speed) >= ws:
                roller = speed.rolling(ws, min_periods=max(1, ws//4), center=True)
                feats[f"sp_m{scale}"] = roller.mean().astype("float32")
                feats[f"sp_s{scale}"] = roller.std().astype("float32")
        feats[f"sp_ratio"] = feats["sp_m10"] / (feats["sp_m160"] + 1e-6)
        return feats 
        
    def _feat_long_range(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Đặc trưng ngữ cảnh dài hạn:
        - x_ml, y_ml: Vị trí trung bình trong quá khứ.
        - sp_pct: Xếp hạng (percentile) của tốc độ hiện tại so với quá khứ.
        """
        feats: Dict[str, pd.Series] = {}
        speed = ctx.speed_series

        for window in [120, 240]:
            ws = self._scale(window)
            if len(ctx.cx) >= ws:
                feats[f"x_ml{window}"] = ctx.cx.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()
                feats[f"y_ml{window}"] = ctx.cy.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()

        for span in [60, 120]:
            s = self._scale(span)
            feats[f"x_e{span}"] = ctx.cx.ewm(span=s, min_periods=1).mean()
            feats[f"y_e{span}"] = ctx.cy.ewm(span=s, min_periods=1).mean()

        for window in [60, 120]:
            ws = self._scale(window)
            if len(speed) >= ws:
                feats[f"sp_pct{window}"] = speed.rolling(
                    ws, min_periods=max(5, ws // 6), center=True
                ).rank(pct=True)
        return feats
    

    def _feat_curvature(self, ctx: AgentContext, **kwargs) -> Dict:
        feats = {}

        vel_x, vel_y = ctx.vel[:, 0], ctx.vel[:, 1]
        acc_x, acc_y = ctx.acc[:, 0], ctx.acc[:, 1]
        cross_prod = vel_x * acc_y - vel_y * acc_x
        vel_mag = np.sqrt(vel_x**2 + vel_y**2)
        moving_mask = vel_mag > 2.0
        vel_mag_safe = np.maximum(vel_mag, 0.1 / self.cfg.fps)
        raw_curv = cross_prod / (vel_mag_safe**3)
        raw_curv = np.where(moving_mask, raw_curv, 0.0)
        min_turn_radius_cm = 0.5
        max_k = 1.0 / min_turn_radius_cm
        raw_curv = np.clip(raw_curv, -max_k, max_k)
        abs_curv = np.abs(raw_curv)
        abs_curv_series = pd.Series(abs_curv, index=ctx.idx)

        for w in [30, 60]:
            ws = self._scale(w)
            min_p = max(ws // 3, 1)
            feats[f"curv_mean_{w}"] = abs_curv_series.rolling(ws, min_periods=min_p).mean()

        angle = np.arctan2(vel_y, vel_x)
        angle_series = pd.Series(angle, index=ctx.idx)
        angle_change = np.abs(angle_series.diff().fillna(0.0))
        angle_change = np.where(angle_change > np.pi, 2 * np.pi - angle_change, angle_change)
        angle_change_series = pd.Series(angle_change, index=ctx.idx)
        angle_change_series = pd.Series(np.where(moving_mask, angle_change_series, 0.0), index=ctx.idx)

        ws = self._scale(30)
        feats["turn_rate_30"] = angle_change_series.rolling(ws, min_periods=max(ws // 3, 1)).sum()

        return feats
    
    def _feat_cumulative(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tổng quãng đường di chuyển trong một khoảng thời gian dài xung quanh frame hiện tại.
        """
        feats = {}
        L = max(1, self._scale(180))
        step = np.hypot(ctx.cx.diff(), ctx.cy.diff()).fillna(0.0)
        path = step.rolling(2 * L + 1, min_periods=max(5, L // 6), center=True).sum()
        feats["path_cum180"] =  path.fillna(0.0).astype("float32")
        return feats

    def _feat_speed_asym(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Bất đối xứng tốc độ (Tương lai - Quá khứ).
        """
        w = max(3, self._scale(30))
        v = ctx.speed_series
        v_past = v.rolling(w, min_periods=1).mean()
        v_fut = self._roll_future_mean(v, w, min_p=1)
        return {"spd_asym_1s": (v_fut - v_past).fillna(0.0)}
    
    def _feat_gauss_shift(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Độ lệch Gaussian (KL Divergence) giữa quá khứ và tương lai.
        Đo lường sự thay đổi trạng thái thống kê.
        """
        w = max(5, self._scale(30))
        v = ctx.speed_series
        mu_p = v.rolling(w, min_periods=1).mean()
        va_p = v.rolling(w, min_periods=1).var().clip(lower=1e-6)
        mu_f = self._roll_future_mean(v, w, min_p=1)
        va_f = self._roll_future_var(v, w, min_p=1).clip(lower=1e-6)

        kl_pf = 0.5 * (
            (va_p / va_f) + ((mu_f - mu_p) ** 2) / va_f - 1.0 + np.log(va_f / va_p)
        )
        kl_fp = 0.5 * (
            (va_f / va_p) + ((mu_p - mu_f) ** 2) / va_p - 1.0 + np.log(va_p / va_f)
        )
        return {
            "spd_symkl_1s": (kl_pf + kl_fp).replace([np.inf, -np.inf], np.nan).fillna(0.0)
        }

    def _feat_avoidance_trajectory(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Tính toán quỹ đạo né tránh:
        1. Relative Heading: Góc di chuyển so với hướng tới đối thủ.
        2. Future Distance Gain: Dự báo xem hành động này có giúp chuột ra xa đối thủ trong tương lai không.
        """
        feats = {}
        if target_ctx is None: 
            return feats

        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")
        rel_vec = target_ctx.pos - ctx.pos
        angle_to_target = np.arctan2(rel_vec[:, 1], rel_vec[:, 0])
        my_heading = np.arctan2(ctx.vel[:, 1], ctx.vel[:, 0])
        
        diff = np.abs(angle_to_target - my_heading)
        diff = np.minimum(diff, 2*np.pi - diff) # Chuẩn hóa về [0, pi]
        feats["heading_rel_cos"] = pd.Series(np.cos(diff), index=idx, dtype="float32")
        
        feats["heading_rel_abs"] = pd.Series(diff, index=idx, dtype="float32")
        dist_now = np.linalg.norm(rel_vec, axis=1)
        s_dist = pd.Series(dist_now, index=idx)
        
        scales = [15, 30] # 0.5s và 1s
        for w in scales:
            ws = self._scale(w)
            dist_future = s_dist.shift(-ws)
            gain = dist_future - s_dist
            
            feats[f"dist_gain_{w}f"] = gain.fillna(0.0).astype("float32")

        return feats
    
    def _extract_part(self, ctx: AgentContext, part: str) -> Optional[np.ndarray]:
        if ctx.raw_df is None: return None
        if part not in ctx.raw_df.columns.get_level_values(0): return None
        try:
            sub_df = ctx.raw_df.xs(part, axis=1, level=0)[["x", "y"]].reindex(ctx.idx)
        except KeyError: return None
        raw = sub_df.to_numpy()
        raw = self._forward_fill_nan(raw)
        cm = self._to_cm(raw.astype(np.float32))
        return self._smooth(cm)
    
    def _extract_parts_dict(self, ctx: AgentContext, parts: List[str] = None) -> Dict[str, Optional[np.ndarray]]:
        out = {}
        for p in parts:
            out[p] = self._extract_part(ctx, p)
        return out
        
    def _feat_pose_shape(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Placeholder cho các đặc trưng hình dáng (Elongation, Body Angle...).
        """
        feats = {}

        def zero(): return pd.Series(0.0, index=ctx.idx, dtype="float32")

        def dist(k1, k2):
            p1, p2 = parts.get(k1), parts.get(k2)
            if p1 is None or p2 is None: return zero()
            d = np.linalg.norm(p1 - p2, axis=1)
            return pd.Series(d, index=ctx.idx, dtype="float32")
        
        def body_angle():
            if parts.get("nose") is None: return zero()
            if parts.get("body_center") is None: return zero()
            if parts.get("tail_base") is None: return zero()

            v1 = parts.get("nose") - parts.get("body_center")
            v2 = parts.get("tail_base") - parts.get("body_center")
            dot_product = np.sum(v1 * v2, axis=1)
            mag = np.linalg.norm(v1, axis=1) * np.linalg.norm(v2, axis=1)
            cos_angle = np.clip(dot_product / (mag + 1e-6), -1.0, 1.0).astype("float32")
            return cos_angle
        
        def elongation():
            if parts.get("nose")          is None: return zero()
            if parts.get("tail_base")     is None: return zero()
            if parts.get("lateral_left")  is None: return zero()
            if parts.get("lateral_right") is None: return zero()

            d1 = dist("nose", "tail_base")
            d2 = dist("lateral_left", "lateral_right")
            elongation = d1 / (d2 + 1e-6).astype("float32")
            return elongation
        
        def vel(part: str, n_frames_30fps: int) -> Dict:
            part_pos = self._extract_part(ctx, part)
            if part_pos is None: return zero()
            
            s_x = pd.Series(part_pos[:, 0], index=ctx.idx)
            s_y = pd.Series(part_pos[:, 1], index=ctx.idx)
            raw_speed = self._speed_series(s_x, s_y)

            ws = self._scale(n_frames_30fps)
            val = raw_speed.rolling(ws, min_periods=1, center=True).mean()
            return val.astype("float32")


        target_parts = ["nose", "neck", "body_center", "tail_base", 
                        "ear_left", "ear_right", 
                        "lateral_left", "lateral_right", "tail_midpoint", "tail_tip"]
        
        parts = self._extract_parts_dict(ctx, target_parts)

        feats["a_body_width"]                = dist("lateral_left", "lateral_right")
        feats["aa_nose_bodycenter_dist"]     = dist("nose", "body_center")
        feats["aa_nose_tailbase_dist"]       = dist("nose", "tail_base")
        feats["aa_bodycenter_tailbase_dist"] = dist("body_center", "tail_base")
        
        feats["aa_bodycenter_ear_l_dist"]    = dist("body_center", "ear_left")
        feats["aa_bodycenter_ear_r_dist"]    = dist("body_center", "ear_right")
        feats["aa_bodycenter_lateral_l_dist"]= dist("body_center", "lateral_left")
        feats["aa_bodycenter_lateral_r_dist"]= dist("body_center", "lateral_right")
        
        feats["a_body_angle"]                = body_angle()
        feats["a_elongation"]                = elongation()
        feats["a_tail_base_vel_500ms"]       = vel("tail_base", 15)
        feats["a_tail_base_vel_1000ms"]      = vel("tail_base", 30)
        feats["a_tail_base_vel_2000ms"]      = vel("tail_base", 60)
        feats["a_tail_base_vel_3000ms"]      = vel("tail_base", 90)
        feats["a_nose_vel_500ms"]            = vel("nose", 15)
        feats["a_nose_vel_1000ms"]           = vel("nose", 30)
        feats["a_nose_vel_2000ms"]           = vel("nose", 60)
        feats["a_nose_vel_3000ms"]           = vel("nose", 90)
        feats["a_ear_right_vel_500ms"]       = vel("ear_right", 15)
        feats["a_ear_right_vel_1000ms"]      = vel("ear_right", 30)
        feats["a_ear_right_vel_2000ms"]      = vel("ear_right", 60)
        feats["a_ear_right_vel_3000ms"]      = vel("ear_right", 90)
        # len_1 = dist("tail_base", "tail_midpoint")
        # len_2 = dist("tail_midpoint", "tail_tip")
        # len_full = dist("tail_base", "tail_tip")
        # feats["tail_curl"] = ((len_1 + len_2) / (len_full + 1e-6)).astype("float32")
        return feats

    def _feat_submission_temporal(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Đặc trưng 'Ký ức sợ hãi' (Fear Memory) để bắt Submit tĩnh.
        Giúp phân biệt Submit (sau khi bị đánh) vs Rest (bình yên).
        """
        feats = {}
        if target_ctx is None: return feats
        
        idx = ctx.idx
        vec_to_target = target_ctx.pos - ctx.pos
        dist = np.linalg.norm(vec_to_target, axis=1)
        dist_safe = pd.Series(dist, index=idx).replace(0, 1e-6)
        t_vel = target_ctx.vel
        dot_threat = np.sum(t_vel * (-vec_to_target), axis=1)
        
        threat_raw = (dot_threat / dist_safe).clip(lower=0) 
        threat_raw = threat_raw * (dist_safe < 15.0).astype(float)
        threat_series = pd.Series(threat_raw, index=idx, dtype="float32")
        ws_memory = self._scale(90)
        
        feats["fear_memory_3s"] = threat_series.rolling(ws_memory, min_periods=1).max().astype("float32")
        my_speed = ctx.speed_series
        is_still = (my_speed < 1.0).astype(float)
        parts = self._extract_parts_dict(ctx, ["nose", "tail_base"])
        if parts["nose"] is not None:
            spine_len = np.linalg.norm(parts["nose"] - parts["tail_base"], axis=1)
            is_compact = (spine_len < 8.0).astype(float) # Ví dụ chuột dài < 8cm là co
            is_compact = pd.Series(is_compact, index=idx)
        else:
            is_compact = pd.Series(0.0, index=idx)
        feats["static_submit_prob"] = (
            is_still * is_compact * feats["fear_memory_3s"]
        ).astype("float32")

        return feats

    


    def _feat_pairwise(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict:
        """
        Đặc trưng tương tác cặp đôi (Pairwise): Khoảng cách, Tốc độ tiếp cận.
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None: 
            return feats

        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")

        # --- 1. KHOẢNG CÁCH CƠ BẢN (DISTANCES) ---
        # Vector nối Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        dist = np.linalg.norm(rel_vec, axis=1)
        feats["rel_dist"] = pd.Series(dist, index=idx, dtype="float32")

        # --- 2. KHOẢNG CÁCH CHI TIẾT (NOSE-TO-PART) ---
        # Lấy các bộ phận quan trọng
        my_parts = self._extract_parts_dict(ctx, ["nose", "neck"])
        target_parts = self._extract_parts_dict(target_ctx, 
            ["nose", "tail_base", "body_center", "ear_left", "ear_right", 
             "lateral_left", "lateral_right", "tail_midpoint"])

        def dist_ab(pt_a, pt_b):
            if pt_a is None or pt_b is None: return zero()
            d = np.linalg.norm(pt_a - pt_b, axis=1)
            return pd.Series(d, index=idx, dtype="float32")

        an, tn = my_parts["nose"], target_parts["nose"]
        feats["dist_nose_nose"] = dist_ab(an, tn)
        feats["dist_nose_tail"] = dist_ab(an, target_parts["tail_base"])
        feats["dist_nose_body"] = dist_ab(an, target_parts["body_center"])
        feats["dist_nose_el"]   = dist_ab(an, target_parts["ear_left"])
        feats["dist_nose_er"]   = dist_ab(an, target_parts["ear_right"])
        feats["dist_nose_tll"]  = dist_ab(an, target_parts["lateral_left"])
        feats["dist_nose_tlr"]  = dist_ab(an, target_parts["lateral_right"])
        feats["dist_nose_tmp"]  = dist_ab(an, target_parts["tail_midpoint"])

        # --- 3. ĐỊNH HƯỚNG & GÓC NHÌN (ORIENTATION & GAZE) ---
        # Helper lấy vector cơ thể (Mũi - Đuôi/Thân)
        def get_body_vec(parts_dict):
            head = parts_dict.get("nose")
            # Ưu tiên đuôi, nếu ko có thì dùng thân
            tail = parts_dict.get("tail_base")
            if tail is None: tail = parts_dict.get("body_center") # Fallback
            
            if head is not None and tail is not None:
                return head - tail
            return None

        a_vec = get_body_vec(my_parts)
        t_vec = get_body_vec(target_parts)

        # A. Body Cosine: Hai con cùng chiều hay ngược chiều?
        if a_vec is not None and t_vec is not None:
            dot = np.sum(a_vec * t_vec, axis=1)
            mags = np.linalg.norm(a_vec, axis=1) * np.linalg.norm(t_vec, axis=1)
            feats["body_cosine"] = pd.Series(
                np.clip(dot / (mags + 1e-6), -1.0, 1.0), index=idx, dtype="float32"
            )
        else:
            feats["body_cosine"] = zero()

        # B. Gaze Cosine: Tôi có đang nhìn về phía Target không?
        # Vector ánh nhìn = Target_Pos - My_Pos = rel_vec
        if a_vec is not None:
            dot_gaze = np.sum(a_vec * rel_vec, axis=1)
            mag_a = np.linalg.norm(a_vec, axis=1)
            # dist đã tính ở bước 1
            feats["gaze_cosine"] = pd.Series(
                np.clip(dot_gaze / (mag_a * dist + 1e-6), -1.0, 1.0),
                index=idx, dtype="float32"
            )
        else:
            feats["gaze_cosine"] = zero()

        # --- 4. PHÂN RÃ VẬN TỐC (VELOCITY DECOMPOSITION) - CHÌA KHÓA CHO AVOID/ESCAPE ---
        # Vector đơn vị hướng về địch (u)
        dist_safe = dist.copy()
        dist_safe[dist_safe == 0] = 1e-6
        u_vec = rel_vec / dist_safe[:, None]

        # a_vel và t_vel lấy từ Context
        a_vel, t_vel = ctx.vel, target_ctx.vel

        # A. Approach Speed (Vận tốc dọc trục nối 2 con)
        # Dương: Lao vào nhau | Âm: Chạy ra xa nhau
        a_along = np.sum(a_vel * u_vec, axis=1)
        t_along = np.sum(t_vel * (-u_vec), axis=1) # Target hướng ngược lại
        rel_along = np.sum((a_vel - t_vel) * u_vec, axis=1)

        # B. Lateral Speed (Vận tốc ngang - Vuông góc trục nối)
        # Vector chiếu: v_proj = (v . u) * u
        a_proj = a_along[:, None] * u_vec
        a_lat_vec = a_vel - a_proj
        a_lat_speed = np.linalg.norm(a_lat_vec, axis=1)

        feats["approach_speed_agent"]  = pd.Series(a_along, index=idx, dtype="float32")
        feats["approach_speed_target"] = pd.Series(t_along, index=idx, dtype="float32")
        feats["approach_speed_rel"]    = pd.Series(rel_along, index=idx, dtype="float32")
        feats["lateral_speed_agent"]   = pd.Series(a_lat_speed, index=idx, dtype="float32")
        return feats


    # --- Compatibility methods ---
    
    def build_pose_tensor(self, tracking: pd.DataFrame):
        """
        Chuyển dữ liệu tracking (DataFrame) sang Tensor [Frames, Mice, 2] và Dict chi tiết.
        """
        tracking = tracking.sort_values("video_frame")
        frames = np.sort(tracking["video_frame"].unique())
        
        pvid = tracking.pivot(
            index="video_frame", 
            columns=["mouse_id", "bodypart"], 
            values=["x", "y"]
        )
        pvid = pvid.reorder_levels([1, 2, 0], axis=1).sort_index(axis=1).astype("float32")
        mouse_ids = list(pvid.columns.get_level_values(0).unique())
        pos = np.full((len(frames), len(mouse_ids), 2), np.nan, dtype=np.float32)
        per_mouse_df = {}
        
        for i, mid in enumerate(mouse_ids):
            single = pvid[mid]
            per_mouse_df[mid] = single
            
            if "body_center" in single.columns.get_level_values(0):
                cx = single["body_center"]["x"]
                cy = single["body_center"]["y"]
            else:
                cx = single.xs("x", level=1, axis=1).mean(axis=1)
                cy = single.xs("y", level=1, axis=1).mean(axis=1)
            
            pos[:, i, 0] = cx.reindex(frames).values
            pos[:, i, 1] = cy.reindex(frames).values
            
        return frames, mouse_ids, pos, per_mouse_df

    def extract_agent_target(
        self, 
        frames: np.ndarray, 
        mouse_ids: List[Any], 
        pos: np.ndarray, 
        agent_id: Any, 
        target_id: Any, 
        per_mouse_df: Dict = None
    ) -> pd.DataFrame:
        """
        Trích xuất đặc trưng cho cặp (Agent, Target).
        """
        try:
            aid_idx = mouse_ids.index(agent_id)
        except ValueError:
            return pd.DataFrame() 

        # 1. Build Agent Context
        ctx_agent = self._build_context(
            frames, 
            pos[:, aid_idx, :], 
            per_mouse_df.get(agent_id) if per_mouse_df else None
        )

        # 2. Build Target Context
        ctx_target = None
        if self.cfg.use_pairwise and target_id is not None and target_id in mouse_ids:
             tid_idx = mouse_ids.index(target_id)
             ctx_target = self._build_context(
                 frames, 
                 pos[:, tid_idx, :], 
                 per_mouse_df.get(target_id) if per_mouse_df else None
             )

        # 3. Run all features
        all_data = {}
        for func_name, func in self.feature_registry.items():
            out_dict = func(ctx_agent, target_ctx=ctx_target)
            all_data.update(out_dict)

        df_out = pd.DataFrame(all_data, index=ctx_agent.idx)
        df_out = df_out.replace([np.inf, -np.inf], np.nan).fillna(0.0)
        
        return df_out.reindex(sorted(df_out.columns), axis=1)


from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import gc
import itertools
import json
import time
import warnings
import joblib
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedGroupKFold

# === IMPORT MODEL & OPTUNA ===
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
import optuna

# Configuration
# Only show Optuna log messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Do not show pandas PerformanceWarning or any UserWarning.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings('ignore', category=UserWarning)
# Make NumPy stay silent on invalid floating-point operations and divisions by zero.
np.seterr(invalid="ignore", divide="ignore")

# Metric
import sys
sys.path.append("/kaggle/usr/lib/mabe-f-beta")
try:
    from metric import score
except ImportError:
    # Best-effort fallback so the rest of the file still runs without the
    # metric package. Warn explicitly: a silent placeholder would make every
    # reported score 0.0 with no hint as to why.
    warnings.warn(
        "Could not import 'score' from 'metric'; using a placeholder that always returns 0.0.",
        RuntimeWarning,
    )

    def score(*args, **kwargs):
        """Placeholder metric used when the real one is unavailable; always returns 0.0."""
        return 0.0

# =========================================================
# 1. CONFIGURATION & SEED
# =========================================================
SEED = 42
def seed_everything(seed=42):
    """
    Seed the global random number generators of Python's ``random`` module
    and of NumPy, so that code drawing from either source is reproducible.
    """
    import random  # local import: the file's import block does not provide it

    random.seed(seed)
    np.random.seed(seed)
seed_everything(SEED)

# Input locations: competition data root and its tracking / annotation sub-folders.
INPUT_DIR = Path("/kaggle/input/MABe-mouse-behavior-detection")
TRAIN_TRACKING_DIR = INPUT_DIR / "train_tracking"
TRAIN_ANNOTATION_DIR = INPUT_DIR / "train_annotation"
TEST_TRACKING_DIR = INPUT_DIR / "test_tracking"

WORKING_DIR = Path("/kaggle/working")
# NOTE(review): RESULTS_DIR lives under /kaggle/input, which is presumably
# read-only; mkdir would then fail if the directory does not already exist — confirm.
RESULTS_DIR = Path(r"/kaggle/input/results-ensemble-optuna")
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

# Behaviour names, split into single-animal and agent/target (pairwise) behaviours.
SELF_BEHAVIORS = ["biteobject", "climb", "dig", "exploreobject", "freeze", "genitalgroom", "huddle", "rear", "rest", "run", "selfgroom"]
PAIR_BEHAVIORS = ["allogroom", "approach", "attack", "attemptmount", "avoid", "chase", "chaseattack", "defend", "disengage", "dominance", "dominancegroom", "dominancemount", "ejaculate", "escape", "flinch", "follow", "intromit", "mount", "reciprocalsniff", "shepherd", "sniff", "sniffbody", "sniffface", "sniffgenital", "submit", "tussle"]
# Presumably a list of video ids to exclude; empty here and not used in the visible code.
BAD_VIDEOS = []

# =========================================================
# 2. DATA LOADING & PREPARATION (NO CACHE)
# =========================================================

def load_metadata() -> pd.DataFrame:
    """Read the training metadata table (``train.csv``) from the input directory."""
    metadata_path = INPUT_DIR / "train.csv"
    return pd.read_csv(metadata_path)

def get_video_params(video_id: Any, meta: pd.DataFrame) -> Tuple[float, float]:
    """
    Look up ``(frames_per_second, pix_per_cm_approx)`` for a video.

    Falls back to 30.0 fps and 1.0 pixel/cm when the video is not in ``meta``.
    The same defaults replace a value that is NaN, infinite or not positive,
    since such a value would otherwise spread NaN or division errors through
    every feature.
    """
    default_fps, default_pix = 30.0, 1.0

    rows = meta.loc[meta["video_id"] == video_id]
    if rows.empty:
        return default_fps, default_pix

    first = rows.iloc[0]
    fps = float(first["frames_per_second"])
    pix = float(first["pix_per_cm_approx"])
    if not (np.isfinite(fps) and fps > 0):
        fps = default_fps
    if not (np.isfinite(pix) and pix > 0):
        pix = default_pix
    return fps, pix

def load_tracking(lab_id: str, video_id: Any, is_test=False) -> pd.DataFrame:
    """
    Read the tracking parquet file of one video, from the test or the train
    tracking directory.

    Raises FileNotFoundError when the file does not exist.
    """
    base_dir = TEST_TRACKING_DIR if is_test else TRAIN_TRACKING_DIR
    parquet_path = base_dir / str(lab_id) / f"{video_id}.parquet"
    if parquet_path.exists():
        return pd.read_parquet(parquet_path)
    raise FileNotFoundError(parquet_path)

def load_annotation(lab_id: str, video_id: Any) -> pd.DataFrame:
    """
    Read the annotation table of one training video, restricted to the
    agent / target / action / start / stop columns.

    Returns an empty table with those columns when the file does not exist.
    """
    wanted = ["agent_id", "target_id", "action", "start_frame", "stop_frame"]
    parquet_path = TRAIN_ANNOTATION_DIR / str(lab_id) / f"{video_id}.parquet"
    if parquet_path.exists():
        return pd.read_parquet(parquet_path)[wanted]
    return pd.DataFrame(columns=wanted)

# Compute features WITHOUT caching, to avoid running out of RAM
def get_frame_features_no_cache(lab_id, video_id, agent_id, target_id, meta, is_test=False):
    """
    Compute the per-frame feature table of one (agent, target) pair of a
    video. Nothing is cached: the tracking file is read and the features are
    recomputed on every call.

    Returns ``(frames, features_df)`` with ``features_df`` indexed by ``frames``.

    On the test path the video must be present in ``meta`` (IndexError
    otherwise). On both paths, an fps or pix_per_cm value that is NaN,
    infinite or not positive is replaced by 30.0 / 1.0.
    """
    if is_test:
        row = meta[meta["video_id"] == video_id].iloc[0]
        fps, pix = float(row["frames_per_second"]), float(row["pix_per_cm_approx"])
    else:
        fps, pix = get_video_params(video_id, meta)

    # The same sanity checks apply to both paths: a bad calibration value
    # would turn every cm-based feature into NaN, and a bad fps breaks the
    # window scaling.
    if not (np.isfinite(pix) and pix > 0):
        pix = 1.0
    if not (np.isfinite(fps) and fps > 0):
        fps = 30.0

    tracking = load_tracking(lab_id, video_id, is_test)

    # === FeatureExtractor is defined in the previous cell ===
    fe = FeatureExtractor(fps=fps, pix_per_cm=pix, smooth_sigma=1.0, use_pairwise=True)

    frames, mouse_ids, pos, per_mouse_df = fe.build_pose_tensor(tracking)

    features_df = fe.extract_agent_target(
        frames=frames, mouse_ids=mouse_ids, pos=pos,
        agent_id=agent_id, target_id=target_id, per_mouse_df=per_mouse_df
    )
    # NOTE(review): extract_agent_target returns an empty frame when agent_id
    # is not among the tracked mice; the assignment below then presumably
    # fails with a length mismatch — confirm this is the intended failure.
    features_df.index = frames
    return frames, features_df

def build_frame_dataset_for_lab_behavior(lab_id, behavior, train_meta, mode="self"):
    """Build a frame-level binary dataset for one lab and one behavior.

    For every video of ``lab_id`` that has annotations of ``behavior``, and for
    every distinct (agent, target) pair in those annotations, per-frame
    features are computed and each frame is labelled 1 when it lies inside an
    annotated ``[start_frame, stop_frame)`` interval, else 0.

    Args:
        lab_id: Lab whose videos are used.
        behavior: Action name to build labels for.
        train_meta: Training metadata with ``lab_id`` and ``video_id`` columns.
        mode: ``"self"`` computes features with the agent as its own target;
            any other value uses the annotated target.

    Returns:
        ``(index_df, features_df, labels)``. ``index_df`` has columns
        ``video_id``, ``agent_id``, ``target_id``, ``video_frame``; ``labels``
        is an int8 array. All three are empty when nothing was collected.
    """
    videos = train_meta[train_meta["lab_id"] == lab_id]["video_id"].unique().tolist()
    index_list, feature_list, label_list = [], [], []

    for video_id in videos:
        ann = load_annotation(lab_id, video_id)
        if ann.empty:
            continue

        ann_bhv = ann[ann["action"] == behavior]
        if ann_bhv.empty:
            continue

        pairs = ann_bhv[["agent_id", "target_id"]].drop_duplicates().values.tolist()
        for agent_id, target_id in pairs:
            target_id_use = agent_id if mode == "self" else target_id

            ann_pair = ann_bhv[(ann_bhv["agent_id"] == agent_id) & (ann_bhv["target_id"] == target_id)]
            if ann_pair.empty and mode == "self":
                ann_pair = ann_bhv[ann_bhv["agent_id"] == agent_id]

            # Collect positive frames first: it is cheap, and it lets us skip
            # the expensive tracking load + feature extraction below when the
            # pair has no positive frame at all.
            pos_frames = set()
            for start, stop in zip(ann_pair["start_frame"], ann_pair["stop_frame"]):
                pos_frames.update(range(int(start), int(stop)))
            if not pos_frames:
                continue

            # Features are computed directly (no cache).
            frames, feat_df = get_frame_features_no_cache(lab_id, video_id, agent_id, target_id_use, train_meta)

            label = np.isin(frames, list(pos_frames)).astype("int8")
            if label.sum() == 0:
                continue

            # Store with a reset index right away to limit memory overhead.
            index_list.append(pd.DataFrame({"video_id": video_id, "agent_id": agent_id, "target_id": target_id, "video_frame": frames}))
            feature_list.append(feat_df.reset_index(drop=True))
            label_list.append(label)

            # Drop the local references immediately.
            del frames, feat_df, label

    if not index_list:
        return pd.DataFrame(), pd.DataFrame(), np.zeros(0, dtype="int8")

    return (
        pd.concat(index_list, ignore_index=True),
        pd.concat(feature_list, ignore_index=True),
        np.concatenate(label_list).astype("int8"),
    )

# =========================================================
# 3. TRAINING & ENSEMBLE HELPERS
# =========================================================

def train_catboost_fold(X_tr, y_tr, X_va, y_va, sw=1.0):
    """Fit a GPU CatBoost classifier on one fold with early stopping.

    Args:
        X_tr, y_tr: Training features and labels.
        X_va, y_va: Validation features and labels used as the eval set.
        sw: Value passed as ``scale_pos_weight``.

    Returns:
        The fitted ``CatBoostClassifier`` (best iteration kept).
    """
    params = {
        'iterations': 1000,
        'learning_rate': 0.05,
        'depth': 6,
        'scale_pos_weight': sw,
        'task_type': 'GPU',
        'devices': '0',
        'verbose': 0,
        'allow_writing_files': False,
        'l2_leaf_reg': 5,
        'bootstrap_type': 'Bernoulli',
        'subsample': 0.8,
        'random_seed': SEED,
    }
    train_pool = cb.Pool(X_tr, y_tr)
    valid_pool = cb.Pool(X_va, y_va)

    model = cb.CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=20, use_best_model=True)
    return model

def train_lightgbm_fold(X_tr, y_tr, X_va, y_va, sw=1.0):
    """Train a GPU LightGBM binary booster on one fold with early stopping.

    Args:
        X_tr, y_tr: Training features and labels.
        X_va, y_va: Validation features and labels (the only validation set).
        sw: Value passed as ``scale_pos_weight``.

    Returns:
        The trained ``lightgbm.Booster``.
    """
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'learning_rate': 0.05,
        'max_depth': 6,
        'num_leaves': 31,
        'scale_pos_weight': sw,
        'device': 'gpu',
        'verbosity': -1,
        'min_child_weight': 5,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'subsample_freq': 1,
        'seed': SEED,
    }
    train_set = lgb.Dataset(X_tr, y_tr)
    valid_set = lgb.Dataset(X_va, y_va)
    stopper = lgb.early_stopping(20, verbose=False)

    booster = lgb.train(params, train_set, 1000, valid_sets=[valid_set], callbacks=[stopper])
    return booster

def optimize_ensemble_weights(oof_dict, y_true):
    """Search blend weights and a decision threshold that maximise F1.

    Runs 50 Optuna trials (TPE sampler seeded with ``SEED``). Each trial draws
    one raw weight in [0, 1] per model, normalises them to sum to ~1, blends
    the out-of-fold predictions, draws a threshold in [0.1, 0.9] and scores
    the thresholded blend with ``f1_score``.

    Args:
        oof_dict: Mapping ``model name -> out-of-fold prediction array``.
        y_true: Ground-truth binary labels aligned with the predictions.

    Returns:
        ``(weights, threshold)`` where ``weights`` maps each model name to its
        normalised weight from the best trial.
    """
    names = list(oof_dict.keys())

    def objective(trial):
        # Weights are suggested before the threshold; keep that order so the
        # seeded sampler produces the same sequence of draws.
        raw = [trial.suggest_float(name, 0.0, 1.0) for name in names]
        total = sum(raw) + 1e-6
        normalised = [value / total for value in raw]

        blend = np.zeros_like(y_true, dtype=float)
        for name, weight in zip(names, normalised):
            blend += oof_dict[name] * weight

        cutoff = trial.suggest_float("th", 0.1, 0.9)
        return f1_score(y_true, (blend >= cutoff).astype(int), zero_division=0)

    sampler = optuna.samplers.TPESampler(seed=SEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=50)

    best = study.best_params
    threshold = best.pop("th")
    chosen = [best[name] for name in names]
    denom = sum(chosen) + 1e-6
    return {name: value / denom for name, value in zip(names, chosen)}, threshold

def train_validate_ensemble(lab_id, behavior, indices, features, labels):
    """Train a 3-fold XGBoost/CatBoost/LightGBM ensemble and score it out-of-fold.

    For each fold of a ``StratifiedGroupKFold`` split (grouped by ``video_id``)
    the three models are trained, saved under
    ``RESULTS_DIR/<lab_id>/<behavior>/fold_<k>/`` and used to fill out-of-fold
    predictions. Blend weights and a threshold are then tuned on those
    predictions with ``optimize_ensemble_weights``.

    Files written into ``RESULTS_DIR/<lab_id>/<behavior>``:
    ``ensemble_params.json`` (weights + threshold), ``oof.parquet``
    (``indices`` plus ``fold``, ``pred``, ``lbl``) and ``f1.txt``.

    Args:
        lab_id: Lab identifier (directory name component).
        behavior: Behavior name (directory name component).
        indices: DataFrame aligned with ``features``; must contain ``video_id``.
        features: Feature DataFrame; column names become XGBoost feature names.
        labels: Binary label array aligned with ``features``.

    Returns:
        The out-of-fold F1 score as a float, or ``0.0`` when ``labels`` is
        empty or contains no positive (the result directory is still created).
    """
    res_dir = RESULTS_DIR / lab_id / behavior
    res_dir.mkdir(parents=True, exist_ok=True)

    # Nothing to learn without at least one positive frame.
    if len(labels) == 0 or labels.sum() == 0: return 0.0

    X = features.values.astype("float32")
    y = labels.astype("int8")
    # Group by video so frames of one video never span train and validation.
    groups = indices["video_id"].values
    
    oof_preds = {m: np.zeros(len(y), dtype="float32") for m in ["xgb", "cat", "lgb"]}
    # Fold id per row; -1 marks rows never placed in a validation fold.
    folds = np.ones(len(y), dtype="int8") * -1

    cv = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=SEED)
    for fold, (tr_idx, va_idx) in enumerate(cv.split(X, y, groups=groups)):
        print(f"   Fold {fold}...", end=" ")
        fd_dir = res_dir / f"fold_{fold}"; fd_dir.mkdir(parents=True, exist_ok=True)
        X_tr, y_tr = X[tr_idx], y[tr_idx]; X_va, y_va = X[va_idx], y[va_idx]
        # Class-imbalance weight: negatives / positives of the training split.
        pos = y_tr.sum(); neg = len(y_tr) - pos
        sw = float(neg/pos) if pos > 0 else 1.0

        # 1. XGBoost
        dtr = xgb.QuantileDMatrix(X_tr, label=y_tr, feature_names=features.columns.tolist(), max_bin=64)
        dva = xgb.DMatrix(X_va, label=y_va, feature_names=features.columns.tolist())
        xp = {
            "objective":"binary:logistic", "eval_metric":"logloss", "device":"cuda", 
            "tree_method":"hist", "learning_rate":0.05, "max_depth":6, "scale_pos_weight":sw,
            "min_child_weight":5, "subsample":0.8, "colsample_bytree":0.8, "max_bin":64, "seed": SEED
        }
        
        # === 'evals=' WAS ADDED TO THE CALL BELOW ===
        # The early-stopping callback monitors the "valid" set passed via evals.
        mx = xgb.train(
            params=xp, 
            dtrain=dtr, 
            num_boost_round=1000, 
            evals=[(dva, "valid")],
            callbacks=[xgb.callback.EarlyStopping(rounds=20, save_best=True)], 
            verbose_eval=False
        )
        mx.save_model(fd_dir / "model_xgb.json")
        oof_preds["xgb"][va_idx] = mx.predict(dva)

        # 2. CatBoost
        mc = train_catboost_fold(X_tr, y_tr, X_va, y_va, sw)
        mc.save_model(str(fd_dir / "model_cat.cbm"))
        oof_preds["cat"][va_idx] = mc.predict_proba(X_va)[:,1]

        # 3. LightGBM
        ml = train_lightgbm_fold(X_tr, y_tr, X_va, y_va, sw)
        ml.save_model(fd_dir / "model_lgb.txt")
        oof_preds["lgb"][va_idx] = ml.predict(X_va)
        folds[va_idx] = fold
        
        print("Done.")
        # Release fold-local data and models before the next fold.
        del X_tr, y_tr, X_va, y_va, dtr, dva, mx, mc, ml
        gc.collect()

    print("   Optimizing Weights...", end=" ")
    weights, th = optimize_ensemble_weights(oof_preds, y)
    with open(res_dir / "ensemble_params.json", "w") as f: json.dump({"weights": weights, "threshold": th}, f)
    
    # Weighted blend of the out-of-fold predictions, then thresholded.
    final_pred = sum(oof_preds[m] * weights[m] for m in weights)
    final_lbl = (final_pred >= th).astype("int8")
    
    # Save OOF
    df = indices.copy(); df["fold"] = folds; df["pred"] = final_pred; df["lbl"] = final_lbl
    df.to_parquet(res_dir / "oof.parquet", index=False)
    
    f1 = f1_score(y, final_lbl, zero_division=0)
    print(f"Best F1: {f1:.4f} (Th={th:.2f}, W={weights})")
    (res_dir / "f1.txt").write_text(f"{f1:.6f}")
    return float(f1)

# =========================================================
# 4. INFERENCE
# =========================================================

def load_ensemble_models(lab_id, behavior):
    """Load the per-fold models saved for one lab/behavior.

    Scans ``RESULTS_DIR/<lab_id>/<behavior>/fold_*`` in sorted order. A fold
    is skipped when its ``model_xgb.json`` is missing. CatBoost and LightGBM
    are best-effort: if either fails to load, its entry is ``None`` and a
    warning is printed.

    Returns:
        A list with one dict per fold: ``{"xgb": Booster, "cat": model or
        None, "lgb": Booster or None}``. Empty when the directory is missing.
    """
    base = RESULTS_DIR / lab_id / behavior
    if not base.exists():
        return []

    models = []
    for fd in sorted(base.glob("fold_*")):
        xgb_path = fd / "model_xgb.json"
        if not xgb_path.exists():
            continue

        xgb_b = xgb.Booster()
        xgb_b.load_model(str(xgb_path))

        # `except Exception` (not a bare `except`) so KeyboardInterrupt and
        # SystemExit still propagate; the failure is reported, not hidden.
        try:
            cat_m = cb.CatBoostClassifier()
            cat_m.load_model(str(fd / "model_cat.cbm"))
        except Exception as exc:
            print(f"Cannot load CatBoost model in {fd}: {exc}")
            cat_m = None

        try:
            lgb_m = lgb.Booster(model_file=str(fd / "model_lgb.txt"))
        except Exception as exc:
            print(f"Cannot load LightGBM model in {fd}: {exc}")
            lgb_m = None

        models.append({"xgb": xgb_b, "cat": cat_m, "lgb": lgb_m})
    return models

def predict_behaviors_for_pair(lab_id, video_id, aid, tid, behaviors, test_meta):
    """Predict behavior segments for one (agent, target) pair of a test video.

    For every behavior that has saved ensemble parameters and models, each
    fold's three predictions are blended with the stored weights, values
    below the stored threshold are zeroed, and the result is averaged over
    folds. Each frame gets the behavior with the highest score (``"none"``
    when every score is 0), and consecutive frames with the same label are
    merged into segments.

    Args:
        lab_id: Lab identifier; only ``"AdaptableSnail"`` is handled.
        video_id: Video identifier (converted with ``int`` in the output).
        aid: Agent mouse id.
        tid: Target mouse id.
        behaviors: Behavior names to score.
        test_meta: Test metadata passed to the feature builder.

    Returns:
        ``None`` for any other lab; otherwise a DataFrame with columns
        ``video_id``, ``action``, ``start_frame``, ``stop_frame`` (stop is
        the last frame of the run + 1). The frame is empty, but always has
        these columns, when nothing is predicted.
    """
    seg_cols = ["video_id", "action", "start_frame", "stop_frame"]

    if lab_id != "AdaptableSnail":
        return None
    frames, feat_df = get_frame_features_no_cache(lab_id, video_id, aid, tid, test_meta, is_test=True)
    if feat_df.empty:
        return pd.DataFrame(columns=seg_cols)

    scores = {}
    for bhv in behaviors:
        params_path = RESULTS_DIR / lab_id / bhv / "ensemble_params.json"
        if not params_path.exists():
            continue
        with open(params_path) as f:
            p = json.load(f)
        ws, th = p["weights"], p["threshold"]

        folds = load_ensemble_models(lab_id, bhv)
        if not folds:
            continue

        # Rebuild the matrix in the column order the models were trained
        # with; features missing from feat_df stay at 0.
        cols = folds[0]["xgb"].feature_names
        X = pd.DataFrame(0.0, index=feat_df.index, columns=cols, dtype=np.float32)
        c = [name for name in cols if name in feat_df.columns]
        if c:
            X[c] = feat_df[c]
        dtest = xgb.DMatrix(X, feature_names=cols)

        agg = np.zeros(len(feat_df), dtype=np.float32)
        for m in folds:
            px = m["xgb"].predict(dtest)
            # A model that failed to load is stored as None and contributes 0.
            pc = m["cat"].predict_proba(X)[:, 1] if m["cat"] is not None else np.zeros_like(px)
            pl = m["lgb"].predict(X) if m["lgb"] is not None else np.zeros_like(px)

            avg = px*ws.get("xgb", 0.33) + pc*ws.get("cat", 0.33) + pl*ws.get("lgb", 0.33)
            # Keep the blended score only where it passes the threshold.
            agg += avg * (avg >= th).astype("int8")

        scores[bhv] = agg / len(folds)

        del X, dtest
        gc.collect()

    if not scores:
        return pd.DataFrame(columns=seg_cols)

    bl = list(scores.keys())
    mat = np.vstack([scores[b] for b in bl]).T
    lbls = np.where(mat.max(1) == 0, "none", np.array(bl)[mat.argmax(1)])

    # Run-length encode the per-frame labels into segments.
    segs = []
    prev = "none"
    start = None
    pf = None
    for f, l in zip(frames, lbls):
        if l != prev:
            if prev != "none":
                segs.append({"video_id": int(video_id), "action": prev, "start_frame": int(start), "stop_frame": int(pf)+1})
            prev = l
            start = f
        pf = f
    if prev != "none":
        segs.append({"video_id": int(video_id), "action": prev, "start_frame": int(start), "stop_frame": int(pf)+1})

    # Passing the columns keeps the schema identical on every return path,
    # including the case where every frame was labelled "none".
    return pd.DataFrame(segs, columns=seg_cols)

# =========================================================
# 5. MAIN
# =========================================================
# Inference driver: predicts segments for every test video of `target_lab`
# and writes them to WORKING_DIR / "submission1.csv".
target_lab = "AdaptableSnail"
print("\n=== START INFERENCE ===")
test_meta = pd.read_csv(INPUT_DIR / "test.csv")
test_meta = test_meta[test_meta["lab_id"] == target_lab].reset_index(drop=True)

# Behaviors that have a result directory for this lab, split into the
# self-directed (sb) and pairwise (pb) groups.
trained = sorted([p.name for p in (RESULTS_DIR/target_lab).iterdir() if p.is_dir()])
sb, pb = [b for b in trained if b in SELF_BEHAVIORS], [b for b in trained if b in PAIR_BEHAVIORS]

all_segs = []
# Format a mouse id as "mouse<N>" unless it already starts with "mouse".
def fid(i): return str(i) if str(i).startswith("mouse") else f"mouse{i}"

for vid in sorted(test_meta["video_id"].unique()):
    print(f"Predicting Video {vid}...")
    # Tracking is loaded here only to list the mouse ids present in the video.
    tr = load_tracking(target_lab, vid, is_test=True)
    mids = sorted(tr["mouse_id"].unique())
    
    # Self behaviors: each mouse is scored against itself; target is "self".
    if sb:
        for m in mids:
            df = predict_behaviors_for_pair(target_lab, vid, m, m, sb, test_meta)
            if df is not None and not df.empty:
                df["agent_id"] = fid(m); df["target_id"] = "self"
                all_segs.append(df)
    # Pair behaviors: every ordered (agent, target) pair of distinct mice.
    if pb and len(mids) > 1:
        for a, t in itertools.permutations(mids, 2):
            df = predict_behaviors_for_pair(target_lab, vid, a, t, pb, test_meta)
            if df is not None and not df.empty:
                df["agent_id"] = fid(a); df["target_id"] = fid(t)
                all_segs.append(df)
    del tr
    gc.collect()

cols = ["video_id", "agent_id", "target_id", "action", "start_frame", "stop_frame"]

# Assemble the submission; row_id is a 0-based counter over the sorted rows.
if all_segs:
    sub1 = pd.concat(all_segs, ignore_index=True)
    sub1 = sub1[cols].sort_values(["video_id", "agent_id", "target_id", "action", "start_frame"]).reset_index(drop=True)    
    sub1.insert(0, "row_id", np.arange(len(sub1), dtype=np.int64))
else:
    # No predictions at all: still write a header-only file.
    sub1 = pd.DataFrame(columns=["row_id"] + cols)

sub1.to_csv(WORKING_DIR / "submission1.csv", index=False)
print(f"\nDone! Saved submission to {WORKING_DIR / 'submission1.csv'}")



=== START INFERENCE ===
Predicting Video 438887472...

Done! Saved submission to /kaggle/working/submission1.csv


# BoisterParrot

In [2]:
from pathlib import Path
import shutil
import gc

WORKING_DIR = Path("/kaggle/working")

# 1) Delete everything directly under /kaggle/working except .csv files.
for path in WORKING_DIR.iterdir():
    if path.is_file():
        if path.suffix == ".csv":
            # CSV files are kept.
            continue
        try:
            path.unlink()
        except Exception as e:
            print(f"Cannot remove file {path}: {e}")
        continue

    if path.is_dir():
        try:
            shutil.rmtree(path, ignore_errors=True)
        except Exception as e:
            print(f"Cannot remove dir {path}: {e}")


gc.collect()

0

In [3]:
from __future__ import annotations
from typing import Dict, List, Tuple, Any, Optional
import warnings
from dataclasses import dataclass, field
import numpy as np
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from tqdm import tqdm

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
np.seterr(invalid="ignore", divide="ignore")

# =============================================================================
# 1. CONFIGURATION
# =============================================================================
@dataclass
class FeatureConfig:
    """
    Holds the configuration parameters (hyperparameters) of FeatureExtractor.
    """
    # Video frame rate; used to rescale 30-fps window lengths and as 1/dt.
    fps: float = 30.0
    # Divisor applied to pixel coordinates to obtain centimetres.
    pix_per_cm: float = 1.0
    # Sigma passed to the Gaussian smoothing filter; None disables smoothing.
    smooth_sigma: float = 1.0
    # NOTE(review): presumably toggles the pairwise features; its use is not
    # visible in this part of the file, confirm against the extraction code.
    use_pairwise: bool = True


# =============================================================================
# 2. AGENT CONTEXT
# =============================================================================
@dataclass
class AgentContext:
    """
    Container holding the preprocessed data of a single mouse.
    It avoids recomputing velocity/acceleration several times.
    """
    idx: pd.Index          # Frame index
    pos: np.ndarray        # [F, 2] cm
    vel: np.ndarray        # [F, 2] cm/s
    speed: np.ndarray      # [F, 1] cm/s
    acc: np.ndarray        # [F, 2] cm/s^2
    
    cx: pd.Series          # X coordinate as a Series (for rolling operations)
    cy: pd.Series          # Y coordinate as a Series
    speed_series: pd.Series # Speed as a Series
    
    raw_df: Optional[pd.DataFrame] = None # Raw per-body-part data


# =============================================================================
# 3. FEATURE EXTRACTOR
# =============================================================================
class FeatureExtractor:
    """
    Class trích xuất đặc trưng hành vi từ dữ liệu tracking.
    """
    def __init__(self, fps: float, pix_per_cm: float, smooth_sigma: float = 1.0, use_pairwise: bool = True):
        """Store the settings in a FeatureConfig and register the feature functions.

        ``fps`` and ``pix_per_cm`` are converted with ``float``; the other two
        arguments are stored as given. ``feature_registry`` maps a short name
        to the bound method that computes that feature group.
        """
        settings = FeatureConfig(
            fps=float(fps),
            pix_per_cm=float(pix_per_cm),
            smooth_sigma=smooth_sigma,
            use_pairwise=use_pairwise,
        )
        self.cfg = settings

        # (name, handler) pairs; insertion order is kept by the dict.
        handlers = (
            ("kinematics", self._feat_basic_kinematics),
            ("multiscale", self._feat_multiscale),
            ("long_range", self._feat_long_range),
            ("cumulative", self._feat_cumulative),
            ("curvature", self._feat_curvature),
            ("speed_asym", self._feat_speed_asym),
            ("gauss_shift", self._feat_gauss_shift),
            ("avoid", self._feat_avoidance_trajectory),
            ("pose", self._feat_pose_shape),
            ("a", self._feat_follow_pattern),
            ("b", self._feat_shortburst_social),
            ("pairwise", self._feat_pairwise),
        )
        self.feature_registry = dict(handlers)

    # --- Helpers ---
    def _scale(self, n_frames_30fps: int) -> int:
        """Quy đổi số frame từ chuẩn 30fps sang fps thực tế của video."""
        return max(1, int(round(n_frames_30fps * self.cfg.fps / 30.0)))

    def _to_cm(self, arr):
        """Chuyển pixel -> cm."""
        return arr / self.cfg.pix_per_cm

    def _smooth(self, x):
        """Làm mượt dữ liệu bằng Gaussian filter."""
        if self.cfg.smooth_sigma is None or x.shape[0] < 3: return x
        if np.all(np.isnan(x)): return x
        return gaussian_filter1d(x, sigma=self.cfg.smooth_sigma, axis=0, mode="nearest")

    def _forward_fill_nan(self, pos):
        """
        Điền dữ liệu thiếu (NaN) bằng giá trị hợp lệ trước đó (Forward Fill).
        """
        if np.all(np.isnan(pos)):
            return np.zeros_like(pos)

        pos_ffill = pos.copy()
        mask = np.any(~np.isnan(pos_ffill), axis=1)
        if not mask.any():
            return np.zeros_like(pos_ffill)

        valid_idx = np.where(mask)[0]
        first, last = valid_idx[0], valid_idx[-1]
        pos_ffill[:first] = pos_ffill[first]
        pos_ffill[last + 1:] = pos_ffill[last]
        df_temp = pd.DataFrame(pos_ffill)
        df_temp = df_temp.ffill()
        return df_temp.to_numpy()
    
    def _speed_series(self, cx: pd.Series, cy: pd.Series) -> pd.Series:
        dx = cx.diff()
        dy = cy.diff()
        v = np.hypot(dx, dy).fillna(0.0) * self.cfg.fps
        return v.astype("float32")
    
    def _roll_future_mean(self, s: pd.Series, w: int, min_p: int = 1) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).mean().iloc[::-1]

    def _roll_future_var(self, s: pd.Series, w: int, min_p: int = 2) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).var().iloc[::-1]

    # --- Core Logic ---
    def _compute_kinematics(self, pos_px: np.ndarray):
        """
        Tính toán vật lý cơ bản: Pos(cm), Vel, Speed, Acc.
        Input: Array [Frames, 2] (pixel).
        Output: Tuple (pos_cm, vel, speed, acc).
        """
        pos_ffill = self._forward_fill_nan(pos_px)
        pos_cm = self._to_cm(pos_ffill.astype(np.float32))
        pos_cm = self._smooth(pos_cm)                                               # [F, 2]

        dt = 1.0 / self.cfg.fps
        vel = np.zeros_like(pos_cm, dtype=np.float32)
        vel[1:] = (pos_cm[1:] - pos_cm[:-1]) / dt                                   # [F, 2: (vx, vy)]
        speed = np.linalg.norm(vel, axis=1, keepdims=True).astype(np.float32)       # [F, 1]

        acc = np.zeros_like(pos_cm, dtype=np.float32)                          
        acc[1:] = (vel[1:] - vel[:-1]) / dt                                         # [F, 2:(ax, ay)]
        return pos_cm.astype(np.float32), vel, speed, acc

    def _build_context(self, frames, pos_px, mouse_df=None) -> AgentContext:
        """
        Build the AgentContext holding the kinematic data of one mouse.

        ``frames`` becomes the index (named "frame") shared by the Series
        fields; ``mouse_df`` is stored untouched as ``raw_df``.
        """
        pos_cm, velocity, speed, accel = self._compute_kinematics(pos_px)
        frame_index = pd.Index(frames, name="frame")

        x_series = pd.Series(pos_cm[:, 0], index=frame_index)
        y_series = pd.Series(pos_cm[:, 1], index=frame_index)
        speed_as_series = pd.Series(speed[:, 0], index=frame_index)

        return AgentContext(
            idx=frame_index,
            pos=pos_cm,
            vel=velocity,
            speed=speed,
            acc=accel,
            cx=x_series,
            cy=y_series,
            speed_series=speed_as_series,
            raw_df=mouse_df,
        )

    # --- Feature Modules ---
    def _feat_basic_kinematics(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Lấy các giá trị thô: tọa độ x, y, vận tốc vx, vy, tốc độ, gia tốc ax, ay.
        """
        return {
            "a_x": ctx.pos[:, 0], "a_y": ctx.pos[:, 1],
            "a_vx": ctx.vel[:, 0], "a_vy": ctx.vel[:, 1],
            "a_speed": ctx.speed[:, 0],
            "a_ax": ctx.acc[:, 0], "a_ay": ctx.acc[:, 1]
        }

    def _feat_multiscale(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tính tốc độ trung bình (Mean) và độ lệch chuẩn (Std) ở đa mức thời gian.
        Feature 'sp_ratio' đo độ bùng nổ (Burstiness).
        """
        feats = {}
        speed = ctx.speed_series
        frame_scales = [10, 40, 160]
        for scale in frame_scales:
            ws = self._scale(scale)
            if len(speed) >= ws:
                roller = speed.rolling(ws, min_periods=max(1, ws//4), center=True)
                feats[f"sp_m{scale}"] = roller.mean().astype("float32")
                feats[f"sp_s{scale}"] = roller.std().astype("float32")
        feats[f"sp_ratio"] = feats["sp_m10"] / (feats["sp_m160"] + 1e-6)
        return feats 

    
        
    def _feat_long_range(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Đặc trưng ngữ cảnh dài hạn:
        - x_ml, y_ml: Vị trí trung bình trong quá khứ.
        - sp_pct: Xếp hạng (percentile) của tốc độ hiện tại so với quá khứ.
        """
        feats: Dict[str, pd.Series] = {}
        speed = ctx.speed_series

        for window in [120, 240]:
            ws = self._scale(window)
            if len(ctx.cx) >= ws:
                feats[f"x_ml{window}"] = ctx.cx.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()
                feats[f"y_ml{window}"] = ctx.cy.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()

        for span in [60, 120]:
            s = self._scale(span)
            feats[f"x_e{span}"] = ctx.cx.ewm(span=s, min_periods=1).mean()
            feats[f"y_e{span}"] = ctx.cy.ewm(span=s, min_periods=1).mean()

        for window in [60, 120]:
            ws = self._scale(window)
            if len(speed) >= ws:
                feats[f"sp_pct{window}"] = speed.rolling(
                    ws, min_periods=max(5, ws // 6), center=True
                ).rank(pct=True)
        return feats
    

    def _feat_curvature(self, ctx: AgentContext, **kwargs) -> Dict:
        feats = {}

        vel_x, vel_y = ctx.vel[:, 0], ctx.vel[:, 1]
        acc_x, acc_y = ctx.acc[:, 0], ctx.acc[:, 1]
        cross_prod = vel_x * acc_y - vel_y * acc_x
        vel_mag = np.sqrt(vel_x**2 + vel_y**2)
        moving_mask = vel_mag > 2.0
        vel_mag_safe = np.maximum(vel_mag, 0.1 / self.cfg.fps)
        raw_curv = cross_prod / (vel_mag_safe**3)
        raw_curv = np.where(moving_mask, raw_curv, 0.0)
        min_turn_radius_cm = 0.5
        max_k = 1.0 / min_turn_radius_cm
        raw_curv = np.clip(raw_curv, -max_k, max_k)
        abs_curv = np.abs(raw_curv)
        abs_curv_series = pd.Series(abs_curv, index=ctx.idx)

        for w in [30, 60]:
            ws = self._scale(w)
            min_p = max(ws // 3, 1)
            feats[f"curv_mean_{w}"] = abs_curv_series.rolling(ws, min_periods=min_p).mean()

        angle = np.arctan2(vel_y, vel_x)
        angle_series = pd.Series(angle, index=ctx.idx)
        angle_change = np.abs(angle_series.diff().fillna(0.0))
        angle_change = np.where(angle_change > np.pi, 2 * np.pi - angle_change, angle_change)
        angle_change_series = pd.Series(angle_change, index=ctx.idx)
        angle_change_series = pd.Series(np.where(moving_mask, angle_change_series, 0.0), index=ctx.idx)

        ws = self._scale(30)
        feats["turn_rate_30"] = angle_change_series.rolling(ws, min_periods=max(ws // 3, 1)).sum()

        return feats
    
    def _feat_cumulative(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tổng quãng đường di chuyển trong một khoảng thời gian dài xung quanh frame hiện tại.
        """
        feats = {}
        L = max(1, self._scale(180))
        step = np.hypot(ctx.cx.diff(), ctx.cy.diff()).fillna(0.0)
        path = step.rolling(2 * L + 1, min_periods=max(5, L // 6), center=True).sum()
        feats["path_cum180"] =  path.fillna(0.0).astype("float32")
        return feats

    def _feat_speed_asym(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Bất đối xứng tốc độ (Tương lai - Quá khứ).
        """
        w = max(3, self._scale(30))
        v = ctx.speed_series
        v_past = v.rolling(w, min_periods=1).mean()
        v_fut = self._roll_future_mean(v, w, min_p=1)
        return {"spd_asym_1s": (v_fut - v_past).fillna(0.0)}
    
    def _feat_gauss_shift(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Độ lệch Gaussian (KL Divergence) giữa quá khứ và tương lai.
        Đo lường sự thay đổi trạng thái thống kê.
        """
        w = max(5, self._scale(30))
        v = ctx.speed_series
        mu_p = v.rolling(w, min_periods=1).mean()
        va_p = v.rolling(w, min_periods=1).var().clip(lower=1e-6)
        mu_f = self._roll_future_mean(v, w, min_p=1)
        va_f = self._roll_future_var(v, w, min_p=1).clip(lower=1e-6)

        kl_pf = 0.5 * (
            (va_p / va_f) + ((mu_f - mu_p) ** 2) / va_f - 1.0 + np.log(va_f / va_p)
        )
        kl_fp = 0.5 * (
            (va_f / va_p) + ((mu_p - mu_f) ** 2) / va_p - 1.0 + np.log(va_p / va_f)
        )
        return {
            "spd_symkl_1s": (kl_pf + kl_fp).replace([np.inf, -np.inf], np.nan).fillna(0.0)
        }

    def _feat_avoidance_trajectory(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Tính toán quỹ đạo né tránh:
        1. Relative Heading: Góc di chuyển so với hướng tới đối thủ.
        2. Future Distance Gain: Dự báo xem hành động này có giúp chuột ra xa đối thủ trong tương lai không.
        """
        feats = {}
        if target_ctx is None: 
            return feats

        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")
        rel_vec = target_ctx.pos - ctx.pos
        # Góc hướng tới địch (Angle to Target)
        angle_to_target = np.arctan2(rel_vec[:, 1], rel_vec[:, 0])
        
        # Góc di chuyển của Tôi (My Heading)
        my_heading = np.arctan2(ctx.vel[:, 1], ctx.vel[:, 0])
        
        # Độ lệch góc (Absolute Difference)
        # Cần xử lý wrap góc (ví dụ: lệch giữa 179 độ và -179 độ là 2 độ chứ ko phải 358)
        diff = np.abs(angle_to_target - my_heading)
        diff = np.minimum(diff, 2*np.pi - diff) # Chuẩn hóa về [0, pi]
        
        # Feature: Cosine của góc lệch
        # 1.0 (0 độ) -> Lao vào
        # 0.0 (90 độ) -> AVOID (Lách ngang)
        # -1.0 (180 độ) -> Escape
        feats["heading_rel_cos"] = pd.Series(np.cos(diff), index=idx, dtype="float32")
        
        # Feature: Góc lệch tuyệt đối (đổi ra độ cho dễ hình dung nếu cần, ở đây để rad)
        feats["heading_rel_abs"] = pd.Series(diff, index=idx, dtype="float32")


        # --- 2. FUTURE DISTANCE GAIN (Hiệu quả tránh né) ---
        # "Sau 15 frame (0.5s) hoặc 30 frame (1s), mình có xa nó ra không?"
        
        dist_now = np.linalg.norm(rel_vec, axis=1)
        s_dist = pd.Series(dist_now, index=idx)
        
        scales = [15, 30] # 0.5s và 1s
        for w in scales:
            ws = self._scale(w)
            
            # Lấy khoảng cách ở tương lai (shift ngược lên)
            # s.shift(-ws) là giá trị của t + ws
            dist_future = s_dist.shift(-ws)
            gain = dist_future - s_dist
            
            feats[f"dist_gain_{w}f"] = gain.fillna(0.0).astype("float32")

        return feats
    
    def _extract_part(self, ctx: AgentContext, part: str) -> Optional[np.ndarray]:
        if ctx.raw_df is None: return None
        if part not in ctx.raw_df.columns.get_level_values(0): return None
        try:
            sub_df = ctx.raw_df.xs(part, axis=1, level=0)[["x", "y"]].reindex(ctx.idx)
        except KeyError: return None
        raw = sub_df.to_numpy()
        raw = self._forward_fill_nan(raw)
        cm = self._to_cm(raw.astype(np.float32))
        return self._smooth(cm)
    
    def _extract_parts_dict(self, ctx: AgentContext, parts: List[str] = None) -> Dict[str, Optional[np.ndarray]]:
        out = {}
        for p in parts:
            out[p] = self._extract_part(ctx, p)
        return out
        
    def _feat_pose_shape(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Placeholder cho các đặc trưng hình dáng (Elongation, Body Angle...).
        """
        feats = {}

        def zero(): return pd.Series(0.0, index=ctx.idx, dtype="float32")

        def dist(k1, k2):
            p1, p2 = parts.get(k1), parts.get(k2)
            if p1 is None or p2 is None: return zero()
            d = np.linalg.norm(p1 - p2, axis=1)
            return pd.Series(d, index=ctx.idx, dtype="float32")
        
        def body_angle():
            if parts.get("nose") is None: return zero()
            if parts.get("body_center") is None: return zero()
            if parts.get("tail_base") is None: return zero()

            v1 = parts.get("nose") - parts.get("body_center")
            v2 = parts.get("tail_base") - parts.get("body_center")
            dot_product = np.sum(v1 * v2, axis=1)
            mag = np.linalg.norm(v1, axis=1) * np.linalg.norm(v2, axis=1)
            cos_angle = np.clip(dot_product / (mag + 1e-6), -1.0, 1.0).astype("float32")
            return cos_angle
        
        # def elongation():
        #     if parts.get("nose")          is None: return zero()
        #     if parts.get("tail_base")     is None: return zero()
        #     if parts.get("lateral_left")  is None: return zero()
        #     if parts.get("lateral_right") is None: return zero()

        #     d1 = dist("nose", "tail_base")
        #     d2 = dist("lateral_left", "lateral_right")
        #     elongation = d1 / (d2 + 1e-6).astype("float32")
        #     return elongation

        
        
        def vel(part: str, n_frames_30fps: int) -> Dict:
            part_pos = self._extract_part(ctx, part)
            if part_pos is None: return zero()
            
            s_x = pd.Series(part_pos[:, 0], index=ctx.idx)
            s_y = pd.Series(part_pos[:, 1], index=ctx.idx)
            raw_speed = self._speed_series(s_x, s_y)

            ws = self._scale(n_frames_30fps)
            val = raw_speed.rolling(ws, min_periods=1, center=True).mean()
            return val.astype("float32")


        target_parts = ["nose", "body_center", "tail_base", 
                        "ear_left", "ear_right"]
        
        parts = self._extract_parts_dict(ctx, target_parts)

        # feats["a_body_width"]                = dist("lateral_left", "lateral_right")
        # feats["aa_nose_bodycenter_dist"]     = dist("nose", "body_center")
        # feats["aa_nose_tailbase_dist"]       = dist("nose", "tail_base")
        # feats["aa_bodycenter_tailbase_dist"] = dist("body_center", "tail_base")
        
        # feats["aa_bodycenter_ear_l_dist"]    = dist("body_center", "ear_left")
        # feats["aa_bodycenter_ear_r_dist"]    = dist("body_center", "ear_right")
        # feats["aa_bodycenter_lateral_l_dist"]= dist("body_center", "lateral_left")
        # feats["aa_bodycenter_lateral_r_dist"]= dist("body_center", "lateral_right")
        
        feats["a_body_angle"]                = body_angle()
        # feats["a_elongation"]                = elongation()
        feats["a_tail_base_vel_500ms"]       = vel("tail_base", 15)
        feats["a_tail_base_vel_1000ms"]      = vel("tail_base", 30)
        feats["a_tail_base_vel_2000ms"]      = vel("tail_base", 60)
        feats["a_tail_base_vel_3000ms"]      = vel("tail_base", 90)
        feats["a_nose_vel_500ms"]            = vel("nose", 15)
        feats["a_nose_vel_1000ms"]           = vel("nose", 30)
        feats["a_nose_vel_2000ms"]           = vel("nose", 60)
        feats["a_nose_vel_3000ms"]           = vel("nose", 90)
        feats["a_ear_right_vel_500ms"]       = vel("ear_right", 15)
        feats["a_ear_right_vel_1000ms"]      = vel("ear_right", 30)
        feats["a_ear_right_vel_2000ms"]      = vel("ear_right", 60)
        feats["a_ear_right_vel_3000ms"]      = vel("ear_right", 90)

        return feats

    def _feat_shortburst_social(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Short-burst social features (10–30 frames) đặc biệt cho attack / chase / escape.
        Chỉ dùng được khi có target_ctx.
        """
        feats = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")
    
        # --- Lấy lại vài quantity cơ bản từ pairwise/avoidance ---
        # vector Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        rel_dist = np.linalg.norm(rel_vec, axis=1)
        rel_dist_s = pd.Series(rel_dist, index=idx, dtype="float32")
    
        # unit vector
        rel_dist_safe = np.where(rel_dist == 0, 1e-6, rel_dist)
        u_vec = rel_vec / rel_dist_safe[:, None]
    
        # velocity dọc trục nối (approach speed)
        a_vel = ctx.vel
        t_vel = target_ctx.vel
        a_along = np.sum(a_vel * u_vec, axis=1)                # +: lao vào target
        t_along = np.sum(t_vel * (-u_vec), axis=1)             # +: target lao vào agent
        rel_along = np.sum((a_vel - t_vel) * u_vec, axis=1)    # +: lại gần nhau
    
        a_along_s = pd.Series(a_along, index=idx, dtype="float32")
        t_along_s = pd.Series(t_along, index=idx, dtype="float32")
        rel_along_s = pd.Series(rel_along, index=idx, dtype="float32")
    
        # speed agent / target
        a_speed = ctx.speed_series
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32"
        )
    
        # heading_rel_cos ~ escape / approach
        # vector body của agent
        # (reuse idea từ _feat_pairwise)
        # head ~ nose, tail ~ tail_base/body_center
        parts_a = self._extract_parts_dict(ctx, ["nose", "tail_base", "body_center"])
        head_a = parts_a.get("nose")
        tail_a = parts_a.get("tail_base") if parts_a.get("tail_base") is not None else parts_a.get("body_center")
    
        if head_a is not None and tail_a is not None:
            body_vec_a = head_a - tail_a
            dot = np.sum(body_vec_a * rel_vec, axis=1)
            mag = np.linalg.norm(body_vec_a, axis=1) * rel_dist_safe
            heading_cos = np.clip(dot / (mag + 1e-6), -1.0, 1.0)
            heading_cos_s = pd.Series(heading_cos, index=idx, dtype="float32")
        else:
            heading_cos_s = zero()
    
        # --- Rolling window 10, 20, 30 frames (ở fps gốc) ---
        for w30 in [10, 20, 30]:
            ws = self._scale(w30)
            min_p = max(1, ws // 3)
    
            # Attack-like: approach mạnh, khoảng cách giảm nhanh
            feats[f"sb_att_approach_mean_{w30}"] = a_along_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_att_rel_along_mean_{w30}"] = rel_along_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_att_dist_delta_{w30}"] = (rel_dist_s - rel_dist_s.shift(ws)).fillna(0.0)
    
            # Chase-like: agent & target đều nhanh, dist tương đối nhỏ
            feats[f"sb_chase_speed_agent_mean_{w30}"] = a_speed.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_chase_speed_target_mean_{w30}"] = t_speed.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_chase_dist_mean_{w30}"] = rel_dist_s.rolling(ws, min_periods=min_p).mean()
    
            # Escape-like: heading ngược, dist tăng nhanh
            feats[f"sb_esc_heading_cos_mean_{w30}"] = heading_cos_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_esc_dist_gain_{w30}"] = (rel_dist_s.shift(-ws) - rel_dist_s).fillna(0.0)
    
        # clip & fillna
        for k, v in feats.items():
            feats[k] = v.replace([np.inf, -np.inf], np.nan).fillna(0.0).astype("float32")
    
        return feats


    def _feat_follow_pattern(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Đặc trưng hành vi FOLLOW:
          - Agent ở gần target
          - Cùng hướng (body + velocity)
          - Tốc độ vừa phải
          - Khoảng cách tương đối ổn định trong 0.5–1s
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")
    
        # --- 1. CÁC ĐẠI LƯỢNG CƠ BẢN ---
        # Vector Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        rel_dist = np.linalg.norm(rel_vec, axis=1)
        rel_dist_s = pd.Series(rel_dist, index=idx, dtype="float32")
    
        # Speed agent/target
        a_speed = ctx.speed_series.astype("float32")
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32",
        )
    
        # Body vector: nose - tail/body_center
        parts_a = self._extract_parts_dict(ctx, ["nose", "tail_base", "body_center"])
        parts_t = self._extract_parts_dict(target_ctx, ["nose", "tail_base", "body_center"])
    
        def body_vec(parts_dict):
            head = parts_dict.get("nose")
            tail = parts_dict.get("tail_base")
            if tail is None:
                tail = parts_dict.get("body_center")
            if head is None or tail is None:
                return None
            return head - tail
    
        a_body = body_vec(parts_a)
        t_body = body_vec(parts_t)
    
        if a_body is not None and t_body is not None:
            dot_bt = np.sum(a_body * t_body, axis=1)
            mag_bt = np.linalg.norm(a_body, axis=1) * np.linalg.norm(t_body, axis=1)
            cos_body = np.clip(dot_bt / (mag_bt + 1e-6), -1.0, 1.0)
            cos_body_s = pd.Series(cos_body, index=idx, dtype="float32")
        else:
            cos_body_s = zero()
    
        # Velocity hướng
        a_vel = ctx.vel
        t_vel = target_ctx.vel
        a_speed_np = np.linalg.norm(a_vel, axis=1)
        t_speed_np = np.linalg.norm(t_vel, axis=1)
        moving_mask = (a_speed_np > 1e-3) & (t_speed_np > 1e-3)
    
        # cos giữa hướng velocity 2 con
        dot_v = np.sum(a_vel * t_vel, axis=1)
        mag_v = a_speed_np * t_speed_np + 1e-6
        cos_vel = np.zeros_like(dot_v, dtype="float32")
        cos_vel[moving_mask] = np.clip(dot_v[moving_mask] / mag_v[moving_mask], -1.0, 1.0)
        cos_vel_s = pd.Series(cos_vel, index=idx, dtype="float32")
    
        # --- 2. WINDOW NGẮN (FOLLOW LÀ PATTERN DÀI HƠN ATTACK) ---
        for w30 in [15, 30, 60]:   # ~0.5s, 1s, 2s
            ws = self._scale(w30)
            min_p = max(ws // 3, 1)
    
            # Khoảng cách trung bình & độ dao động
            m_dist = rel_dist_s.rolling(ws, min_periods=min_p).mean()
            s_dist = rel_dist_s.rolling(ws, min_periods=min_p).std()
    
            # Cùng hướng (body + velocity)
            m_cos_body = cos_body_s.rolling(ws, min_periods=min_p).mean()
            m_cos_vel  = cos_vel_s.rolling(ws, min_periods=min_p).mean()
    
            # Tốc độ vừa phải
            m_sp_a = a_speed.rolling(ws, min_periods=min_p).mean()
            m_sp_t = t_speed.rolling(ws, min_periods=min_p).mean()
    
            feats[f"follow_dist_mean_{w30}"] = m_dist
            feats[f"follow_dist_std_{w30}"]  = s_dist
            feats[f"follow_cos_body_mean_{w30}"] = m_cos_body
            feats[f"follow_cos_vel_mean_{w30}"]  = m_cos_vel
            feats[f"follow_speed_agent_mean_{w30}"] = m_sp_a
            feats[f"follow_speed_target_mean_{w30}"] = m_sp_t
    
        # Clean
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )
    
        return feats
        

    def _feat_pairwise(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict:
        """
        Đặc trưng tương tác cặp đôi (Pairwise): Khoảng cách, Tốc độ tiếp cận.
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None: 
            return feats

        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")

        # --- 1. KHOẢNG CÁCH CƠ BẢN (DISTANCES) ---
        # Vector nối Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        dist = np.linalg.norm(rel_vec, axis=1)
        feats["rel_dist"] = pd.Series(dist, index=idx, dtype="float32")

        # --- 2. KHOẢNG CÁCH CHI TIẾT (NOSE-TO-PART) ---
        # Lấy các bộ phận quan trọng
        my_parts = self._extract_parts_dict(ctx, ["nose"])
        target_parts = self._extract_parts_dict(target_ctx, 
            ["nose", "tail_base", "body_center", "ear_left", "ear_right"])

        def dist_ab(pt_a, pt_b):
            if pt_a is None or pt_b is None: return zero()
            d = np.linalg.norm(pt_a - pt_b, axis=1)
            return pd.Series(d, index=idx, dtype="float32")

        an, tn = my_parts["nose"], target_parts["nose"]
        feats["dist_nose_nose"] = dist_ab(an, tn)
        feats["dist_nose_tail"] = dist_ab(an, target_parts["tail_base"])
        feats["dist_nose_body"] = dist_ab(an, target_parts["body_center"])
        feats["dist_nose_el"]   = dist_ab(an, target_parts["ear_left"])
        feats["dist_nose_er"]   = dist_ab(an, target_parts["ear_right"])
        # feats["dist_nose_tll"]  = dist_ab(an, target_parts["lateral_left"])
        # feats["dist_nose_tlr"]  = dist_ab(an, target_parts["lateral_right"])
        # feats["dist_nose_tt"]  = dist_ab(an, target_parts["tail_tip"])

        # --- 3. ĐỊNH HƯỚNG & GÓC NHÌN (ORIENTATION & GAZE) ---
        # Helper lấy vector cơ thể (Mũi - Đuôi/Thân)
        def get_body_vec(parts_dict):
            head = parts_dict.get("nose")
            # Ưu tiên đuôi, nếu ko có thì dùng thân
            tail = parts_dict.get("tail_base")
            if tail is None: tail = parts_dict.get("body_center") # Fallback
            
            if head is not None and tail is not None:
                return head - tail
            return None

        a_vec = get_body_vec(my_parts)
        t_vec = get_body_vec(target_parts)

        # A. Body Cosine: Hai con cùng chiều hay ngược chiều?
        if a_vec is not None and t_vec is not None:
            dot = np.sum(a_vec * t_vec, axis=1)
            mags = np.linalg.norm(a_vec, axis=1) * np.linalg.norm(t_vec, axis=1)
            feats["body_cosine"] = pd.Series(
                np.clip(dot / (mags + 1e-6), -1.0, 1.0), index=idx, dtype="float32"
            )
        else:
            feats["body_cosine"] = zero()

        # B. Gaze Cosine: Tôi có đang nhìn về phía Target không?
        # Vector ánh nhìn = Target_Pos - My_Pos = rel_vec
        if a_vec is not None:
            dot_gaze = np.sum(a_vec * rel_vec, axis=1)
            mag_a = np.linalg.norm(a_vec, axis=1)
            # dist đã tính ở bước 1
            feats["gaze_cosine"] = pd.Series(
                np.clip(dot_gaze / (mag_a * dist + 1e-6), -1.0, 1.0),
                index=idx, dtype="float32"
            )
        else:
            feats["gaze_cosine"] = zero()

        # --- 4. PHÂN RÃ VẬN TỐC (VELOCITY DECOMPOSITION) - CHÌA KHÓA CHO AVOID/ESCAPE ---
        # Vector đơn vị hướng về địch (u)
        dist_safe = dist.copy()
        dist_safe[dist_safe == 0] = 1e-6
        u_vec = rel_vec / dist_safe[:, None]

        # a_vel và t_vel lấy từ Context
        a_vel, t_vel = ctx.vel, target_ctx.vel

        # A. Approach Speed (Vận tốc dọc trục nối 2 con)
        # Dương: Lao vào nhau | Âm: Chạy ra xa nhau
        a_along = np.sum(a_vel * u_vec, axis=1)
        t_along = np.sum(t_vel * (-u_vec), axis=1) # Target hướng ngược lại
        rel_along = np.sum((a_vel - t_vel) * u_vec, axis=1)

        # B. Lateral Speed (Vận tốc ngang - Vuông góc trục nối)
        # Vector chiếu: v_proj = (v . u) * u
        a_proj = a_along[:, None] * u_vec
        a_lat_vec = a_vel - a_proj
        a_lat_speed = np.linalg.norm(a_lat_vec, axis=1)

        feats["approach_speed_agent"]  = pd.Series(a_along, index=idx, dtype="float32")
        feats["approach_speed_target"] = pd.Series(t_along, index=idx, dtype="float32")
        feats["approach_speed_rel"]    = pd.Series(rel_along, index=idx, dtype="float32")
        feats["lateral_speed_agent"]   = pd.Series(a_lat_speed, index=idx, dtype="float32")
        return feats
        

    # --- Methods tương thích ---
    
    def build_pose_tensor(self, tracking: pd.DataFrame):
        """
        Turn long-format tracking rows into a centroid tensor plus per-mouse tables.

        Args:
            tracking: DataFrame with columns ``video_frame``, ``mouse_id``,
                ``bodypart``, ``x`` and ``y``.

        Returns:
            Tuple ``(frames, mouse_ids, pos, per_mouse_df)``:
              - frames: sorted unique ``video_frame`` values,
              - mouse_ids: mouse identifiers in sorted column order,
              - pos: float32 array ``[len(frames), len(mouse_ids), 2]`` holding
                each mouse's ``body_center`` (x, y), or the mean over all of
                its body parts when ``body_center`` is absent,
              - per_mouse_df: dict mouse_id -> float32 DataFrame with
                (bodypart, coordinate) columns.
        """
        ordered = tracking.sort_values("video_frame")
        frames = np.sort(ordered["video_frame"].unique())

        # Wide table with (mouse_id, bodypart, coordinate) column levels.
        wide = ordered.pivot(
            index="video_frame",
            columns=["mouse_id", "bodypart"],
            values=["x", "y"],
        )
        wide = wide.reorder_levels([1, 2, 0], axis=1)
        wide = wide.sort_index(axis=1).astype("float32")

        mouse_ids = list(wide.columns.get_level_values(0).unique())
        pos = np.full((len(frames), len(mouse_ids), 2), np.nan, dtype=np.float32)
        per_mouse_df = {}

        for slot, mouse in enumerate(mouse_ids):
            table = wide[mouse]
            per_mouse_df[mouse] = table
            has_center = "body_center" in table.columns.get_level_values(0)

            for axis, coord in enumerate(("x", "y")):
                if has_center:
                    track = table["body_center"][coord]
                else:
                    # No body_center: average this coordinate over all parts.
                    track = table.xs(coord, level=1, axis=1).mean(axis=1)
                pos[:, slot, axis] = track.reindex(frames).values

        return frames, mouse_ids, pos, per_mouse_df

    def extract_agent_target(
        self,
        frames: np.ndarray,
        mouse_ids: List[Any],
        pos: np.ndarray,
        agent_id: Any,
        target_id: Any,
        per_mouse_df: Dict = None
    ) -> pd.DataFrame:
        """
        Compute every registered feature for one (agent, target) pair.

        Args:
            frames: Frame numbers, passed on to ``_build_context``.
            mouse_ids: Mouse identifiers matching axis 1 of ``pos``.
            pos: Centroid array indexed as ``pos[:, mouse, :]``.
            agent_id: Mouse whose features are computed.
            target_id: Partner mouse; ignored when None, unknown, or when
                ``cfg.use_pairwise`` is false.
            per_mouse_df: Optional dict mouse_id -> per-part table.

        Returns:
            DataFrame indexed by the agent context's ``idx`` with columns in
            sorted order and inf/NaN replaced by 0. An empty DataFrame is
            returned when ``agent_id`` is not in ``mouse_ids``.
        """
        if agent_id not in mouse_ids:
            return pd.DataFrame()

        def context_for(mouse_id):
            raw = per_mouse_df.get(mouse_id) if per_mouse_df else None
            return self._build_context(frames, pos[:, mouse_ids.index(mouse_id), :], raw)

        # 1. Agent context
        agent_ctx = context_for(agent_id)

        # 2. Target context (only for pairwise extraction with a known target)
        partner_ctx = None
        if self.cfg.use_pairwise and target_id is not None and target_id in mouse_ids:
            partner_ctx = context_for(target_id)

        # 3. Run every registered feature function; later ones overwrite
        #    earlier ones on a name clash.
        columns = {}
        for feature_fn in self.feature_registry.values():
            columns.update(feature_fn(agent_ctx, target_ctx=partner_ctx))

        table = pd.DataFrame(columns, index=agent_ctx.idx)
        table = table.replace([np.inf, -np.inf], np.nan).fillna(0.0)
        return table.reindex(sorted(table.columns), axis=1)


from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import gc
import itertools
import json
import time
import warnings
import joblib
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedGroupKFold

# === IMPORT MODEL & OPTUNA ===
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
import optuna

# Runtime configuration: lower Optuna's log level to WARNING, ignore pandas
# PerformanceWarning and every UserWarning, and silence NumPy's
# invalid/divide floating-point warnings.
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings('ignore', category=UserWarning)
np.seterr(invalid="ignore", divide="ignore")

# Metric: add the directory below to sys.path and try to import `score` from
# the `metric` module; if that import fails, fall back to a stub that always
# returns 0.0 so the rest of the notebook still runs.
import sys
sys.path.append("/kaggle/usr/lib/mabe-f-beta")
try:
    from metric import score
except ImportError:
    def score(*args, **kwargs): return 0.0

# =========================================================
# 1. CẤU HÌNH & SEED
# =========================================================
SEED = 42
def seed_everything(seed=42):
    """Seed the global random number generators used by this notebook.

    Seeds both the standard-library ``random`` module and NumPy's legacy
    global generator. Model libraries receive ``SEED`` explicitly through
    their own parameters and are not touched here.

    Args:
        seed: Integer seed applied to every generator.
    """
    import random  # local import: only needed for seeding

    random.seed(seed)
    np.random.seed(seed)
seed_everything(SEED)

# Competition input locations; tracking and annotation parquet files are read
# from per-lab sub-directories (see load_tracking / load_annotation).
INPUT_DIR = Path("/kaggle/input/MABe-mouse-behavior-detection")
TRAIN_TRACKING_DIR = INPUT_DIR / "train_tracking"
TRAIN_ANNOTATION_DIR = INPUT_DIR / "train_annotation"
TEST_TRACKING_DIR = INPUT_DIR / "test_tracking"

# Output directory for the submission file.
WORKING_DIR = Path("/kaggle/working")
# Root of the per-lab / per-behavior model artefacts (fold models,
# ensemble_params.json, oof.parquet, f1.txt).
# NOTE(review): this points under /kaggle/input, which is presumably read-only
# on Kaggle, so the training helpers that write here would fail - confirm.
RESULTS_DIR = Path(r"/kaggle/input/results-ensemble-optuna1")
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

# Action names split into single-animal and agent/target behaviors; the main
# script uses these lists to decide how each trained behavior is predicted.
SELF_BEHAVIORS = ["biteobject", "climb", "dig", "exploreobject", "freeze", "genitalgroom", "huddle", "rear", "rest", "run", "selfgroom"]
PAIR_BEHAVIORS = ["allogroom", "approach", "attack", "attemptmount", "avoid", "chase", "chaseattack", "defend", "disengage", "dominance", "dominancegroom", "dominancemount", "ejaculate", "escape", "flinch", "follow", "intromit", "mount", "reciprocalsniff", "shepherd", "sniff", "sniffbody", "sniffface", "sniffgenital", "submit", "tussle"]
# Currently empty and not referenced anywhere in the visible code.
BAD_VIDEOS = []

# =========================================================
# 2. DATA LOADING & PREPARATION (NO CACHE)
# =========================================================

def load_metadata() -> pd.DataFrame:
    """Read the training metadata table ``train.csv`` from ``INPUT_DIR``."""
    train_csv = INPUT_DIR / "train.csv"
    return pd.read_csv(train_csv)

def get_video_params(video_id: Any, meta: pd.DataFrame) -> Tuple[float, float]:
    """Return ``(fps, pix_per_cm)`` for a video from the metadata table.

    Args:
        video_id: Value matched against ``meta["video_id"]``.
        meta: Metadata with ``video_id``, ``frames_per_second`` and
            ``pix_per_cm_approx`` columns.

    Returns:
        The values of the first matching row. ``(30.0, 1.0)`` is returned when
        no row matches, and each value individually falls back to its default
        when it is NaN, infinite or not positive, so that downstream unit
        conversion never divides by an invalid scale.
    """
    default_fps, default_pix = 30.0, 1.0

    rows = meta.loc[meta["video_id"] == video_id]
    if rows.empty:
        return default_fps, default_pix

    first = rows.iloc[0]
    fps = float(first["frames_per_second"])
    pix = float(first["pix_per_cm_approx"])

    if not (np.isfinite(fps) and fps > 0):
        fps = default_fps
    if not (np.isfinite(pix) and pix > 0):
        pix = default_pix
    return fps, pix

def load_tracking(lab_id: str, video_id: Any, is_test=False) -> pd.DataFrame:
    """Load the tracking parquet of one video.

    The file is looked up as ``<dir>/<lab_id>/<video_id>.parquet`` where
    ``<dir>`` is the test tracking directory when ``is_test`` is true and the
    train tracking directory otherwise.

    Raises:
        FileNotFoundError: If the parquet file does not exist.
    """
    root = TRAIN_TRACKING_DIR
    if is_test:
        root = TEST_TRACKING_DIR

    parquet_path = root / str(lab_id) / f"{video_id}.parquet"
    if parquet_path.exists():
        return pd.read_parquet(parquet_path)
    raise FileNotFoundError(parquet_path)

def load_annotation(lab_id: str, video_id: Any) -> pd.DataFrame:
    """Load the annotation intervals of one training video.

    Returns the columns agent_id, target_id, action, start_frame and
    stop_frame; an empty frame with those columns is returned when the
    annotation file does not exist.
    """
    wanted = ["agent_id", "target_id", "action", "start_frame", "stop_frame"]
    parquet_path = TRAIN_ANNOTATION_DIR / str(lab_id) / f"{video_id}.parquet"
    if parquet_path.exists():
        return pd.read_parquet(parquet_path)[wanted]
    return pd.DataFrame(columns=wanted)

# Features are recomputed on every call (NO CACHE) to avoid running out of RAM.
def get_frame_features_no_cache(lab_id, video_id, agent_id, target_id, meta, is_test=False):
    """Compute per-frame features for one (agent, target) pair of a video.

    Args:
        lab_id: Lab sub-directory of the tracking file.
        video_id: Video whose tracking file is loaded.
        agent_id: Mouse whose features are computed.
        target_id: Partner mouse passed to the feature extractor.
        meta: Metadata table providing ``frames_per_second`` and
            ``pix_per_cm_approx`` for the video.
        is_test: Read the test tracking directory and take the parameters
            from the first matching ``meta`` row (an invalid pix_per_cm
            becomes 1.0); otherwise use ``get_video_params``.

    Returns:
        ``(frames, features_df)``. ``features_df`` is re-indexed by ``frames``
        when it has one row per frame; when the extractor returns a frame of
        another length (e.g. empty because the agent is unknown) it is
        returned unchanged instead of raising a length-mismatch error, so
        callers can test ``features_df.empty``.
    """
    if is_test:
        row = meta[meta["video_id"] == video_id].iloc[0]
        fps, pix = float(row["frames_per_second"]), float(row["pix_per_cm_approx"])
        pix = pix if np.isfinite(pix) and pix > 0 else 1.0
    else:
        fps, pix = get_video_params(video_id, meta)

    tracking = load_tracking(lab_id, video_id, is_test)

    # FeatureExtractor is defined in an earlier notebook cell.
    fe = FeatureExtractor(fps=fps, pix_per_cm=pix, smooth_sigma=1.0, use_pairwise=True)

    frames, mouse_ids, pos, per_mouse_df = fe.build_pose_tensor(tracking)

    features_df = fe.extract_agent_target(
        frames=frames, mouse_ids=mouse_ids, pos=pos,
        agent_id=agent_id, target_id=target_id, per_mouse_df=per_mouse_df
    )
    # Only relabel rows when there is exactly one row per frame; assigning an
    # index of a different length would raise ValueError.
    if len(features_df) == len(frames):
        features_df.index = frames
    return frames, features_df

def build_frame_dataset_for_lab_behavior(lab_id, behavior, train_meta, mode="self"):
    """Build a per-frame binary dataset for one (lab, behavior).

    For every video of the lab that has annotations of ``behavior`` and every
    annotated (agent, target) pair, frame features are computed and each frame
    inside an annotated ``[start_frame, stop_frame)`` interval is labelled 1.
    Pairs without any positive frame are skipped.

    Args:
        lab_id: Lab whose videos are used (``train_meta["lab_id"]``).
        behavior: Action name to label.
        train_meta: Training metadata table.
        mode: ``"self"`` computes features with the agent as its own target
            and, if no row matches the exact pair, falls back to all of the
            agent's rows; any other value uses the annotated target.

    Returns:
        ``(index_df, feature_df, labels)``, row-aligned. ``index_df`` has the
        columns video_id, agent_id, target_id, video_frame; ``labels`` is an
        int8 array. Two empty DataFrames and an empty array are returned when
        nothing was collected.
    """
    videos = train_meta[train_meta["lab_id"] == lab_id]["video_id"].unique().tolist()
    index_list, feature_list, label_list = [], [], []

    for video_id in videos:
        ann = load_annotation(lab_id, video_id)
        if ann.empty:
            continue

        ann_bhv = ann[ann["action"] == behavior]
        if ann_bhv.empty:
            continue

        pairs = ann_bhv[["agent_id", "target_id"]].drop_duplicates().values.tolist()
        for agent_id, target_id in pairs:
            target_id_use = agent_id if mode == "self" else target_id

            # Features are computed directly (no cache).
            frames, feat_df = get_frame_features_no_cache(lab_id, video_id, agent_id, target_id_use, train_meta)
            # Without one feature row per frame the three outputs could not
            # stay row-aligned, so such a pair is skipped.
            if len(feat_df) != len(frames):
                continue

            ann_pair = ann_bhv[(ann_bhv["agent_id"] == agent_id) & (ann_bhv["target_id"] == target_id)]
            if ann_pair.empty and mode == "self":
                ann_pair = ann_bhv[ann_bhv["agent_id"] == agent_id]

            # Mark frames inside any [start, stop) interval with vectorised
            # comparisons instead of enumerating every positive frame.
            frame_values = np.asarray(frames)
            positive = np.zeros(len(frame_values), dtype=bool)
            for start, stop in zip(ann_pair["start_frame"], ann_pair["stop_frame"]):
                positive |= (frame_values >= int(start)) & (frame_values < int(stop))

            if not positive.any():
                continue
            label = positive.astype("int8")

            # Store the pieces and drop the feature index right away to reduce memory overhead.
            index_list.append(pd.DataFrame({"video_id": video_id, "agent_id": agent_id, "target_id": target_id, "video_frame": frames}))
            feature_list.append(feat_df.reset_index(drop=True))
            label_list.append(label)

            # Release the per-pair objects immediately.
            del frames, feat_df, label

    if not index_list:
        return pd.DataFrame(), pd.DataFrame(), np.zeros(0, dtype="int8")

    return (
        pd.concat(index_list, ignore_index=True),
        pd.concat(feature_list, ignore_index=True),
        np.concatenate(label_list).astype("int8"),
    )

# =========================================================
# 3. TRAINING & ENSEMBLE HELPERS
# =========================================================

def train_catboost_fold(X_tr, y_tr, X_va, y_va, sw=1.0):
    """Fit a CatBoost binary classifier on one fold.

    Trains on ``(X_tr, y_tr)`` with ``scale_pos_weight=sw``, evaluates on
    ``(X_va, y_va)``, stops after 20 rounds without improvement and keeps the
    best iteration. Returns the fitted ``CatBoostClassifier``.
    """
    params = dict(
        iterations=1000, learning_rate=0.05, depth=6, scale_pos_weight=sw,
        task_type='GPU', devices='0', verbose=0, allow_writing_files=False,
        l2_leaf_reg=5, bootstrap_type='Bernoulli', subsample=0.8, random_seed=SEED,
    )
    model = cb.CatBoostClassifier(**params)
    train_pool = cb.Pool(X_tr, y_tr)
    valid_pool = cb.Pool(X_va, y_va)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=20, use_best_model=True)
    return model

def train_lightgbm_fold(X_tr, y_tr, X_va, y_va, sw=1.0):
    """Fit a LightGBM binary booster on one fold.

    Trains for up to 1000 rounds on ``(X_tr, y_tr)`` with
    ``scale_pos_weight=sw`` and stops after 20 rounds without improvement on
    ``(X_va, y_va)``. Returns the trained ``lgb.Booster``.
    """
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'learning_rate': 0.05,
        'max_depth': 6,
        'num_leaves': 31,
        'scale_pos_weight': sw,
        'device': 'gpu',
        'verbosity': -1,
        'min_child_weight': 5,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'subsample_freq': 1,
        'seed': SEED,
    }
    train_set = lgb.Dataset(X_tr, y_tr)
    valid_set = lgb.Dataset(X_va, y_va)
    stopper = lgb.early_stopping(20, verbose=False)
    return lgb.train(params, train_set, 1000, valid_sets=[valid_set], callbacks=[stopper])

def optimize_ensemble_weights(oof_dict, y_true):
    """Search blend weights and a decision threshold that maximise F1.

    Args:
        oof_dict: Mapping model name -> out-of-fold probability array.
        y_true: Binary ground-truth labels aligned with those arrays.

    Returns:
        ``(weights, threshold)`` where ``weights`` maps each model name to its
        share (raw weight divided by the sum of raw weights plus 1e-6) and
        ``threshold`` is the best cut found in [0.1, 0.9] over 50 Optuna trials.
    """
    names = list(oof_dict.keys())

    def objective(trial):
        raw = [trial.suggest_float(name, 0.0, 1.0) for name in names]
        total = sum(raw) + 1e-6
        blend = np.zeros_like(y_true, dtype=float)
        for name, value in zip(names, raw):
            blend += oof_dict[name] * (value / total)
        cut = trial.suggest_float("th", 0.1, 0.9)
        return f1_score(y_true, (blend >= cut).astype(int), zero_division=0)

    sampler = optuna.samplers.TPESampler(seed=SEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=50)

    chosen = dict(study.best_params)
    threshold = chosen.pop("th")
    raw_best = [chosen[name] for name in names]
    total_best = sum(raw_best) + 1e-6
    return {name: value / total_best for name, value in zip(names, raw_best)}, threshold

def train_validate_ensemble(lab_id, behavior, indices, features, labels):
    """Cross-validate an XGBoost + CatBoost + LightGBM ensemble for one behavior.

    Runs a 3-fold StratifiedGroupKFold grouped by ``indices["video_id"]``,
    saves each fold's three models under
    ``RESULTS_DIR/<lab_id>/<behavior>/fold_<k>/``, then tunes blend weights and
    a threshold on the out-of-fold predictions and writes
    ``ensemble_params.json``, ``oof.parquet`` and ``f1.txt`` next to them.

    Args:
        lab_id: Lab name used in the output path.
        behavior: Behavior name used in the output path.
        indices: DataFrame aligned with ``features``; must contain ``video_id``.
        features: Feature DataFrame (its column names become the XGBoost
            feature names).
        labels: Binary label array aligned with ``features``.

    Returns:
        Out-of-fold F1 of the blended, thresholded prediction as a float, or
        0.0 when there are no labels or no positive label.
    """
    res_dir = RESULTS_DIR / lab_id / behavior
    res_dir.mkdir(parents=True, exist_ok=True)

    # Nothing to learn without at least one positive frame.
    if len(labels) == 0 or labels.sum() == 0: return 0.0

    X = features.values.astype("float32")
    y = labels.astype("int8")
    # Frames of one video always stay in the same fold.
    groups = indices["video_id"].values
    
    # Out-of-fold probabilities per model; `folds` records each row's
    # validation fold (-1 = never validated).
    oof_preds = {m: np.zeros(len(y), dtype="float32") for m in ["xgb", "cat", "lgb"]}
    folds = np.ones(len(y), dtype="int8") * -1

    cv = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=SEED)
    for fold, (tr_idx, va_idx) in enumerate(cv.split(X, y, groups=groups)):
        print(f"   Fold {fold}...", end=" ")
        fd_dir = res_dir / f"fold_{fold}"; fd_dir.mkdir(parents=True, exist_ok=True)
        X_tr, y_tr = X[tr_idx], y[tr_idx]; X_va, y_va = X[va_idx], y[va_idx]
        # Class-imbalance weight: negatives per positive in the training part.
        pos = y_tr.sum(); neg = len(y_tr) - pos
        sw = float(neg/pos) if pos > 0 else 1.0

        # 1. XGBoost
        dtr = xgb.QuantileDMatrix(X_tr, label=y_tr, feature_names=features.columns.tolist(), max_bin=64)
        dva = xgb.DMatrix(X_va, label=y_va, feature_names=features.columns.tolist())
        xp = {
            "objective":"binary:logistic", "eval_metric":"logloss", "device":"cuda", 
            "tree_method":"hist", "learning_rate":0.05, "max_depth":6, "scale_pos_weight":sw,
            "min_child_weight":5, "subsample":0.8, "colsample_bytree":0.8, "max_bin":64, "seed": SEED
        }
        
        # `evals=` supplies the validation set that the EarlyStopping callback monitors.
        mx = xgb.train(
            params=xp, 
            dtrain=dtr, 
            num_boost_round=1000, 
            evals=[(dva, "valid")],
            callbacks=[xgb.callback.EarlyStopping(rounds=20, save_best=True)], 
            verbose_eval=False
        )
        mx.save_model(fd_dir / "model_xgb.json")
        oof_preds["xgb"][va_idx] = mx.predict(dva)

        # 2. CatBoost
        mc = train_catboost_fold(X_tr, y_tr, X_va, y_va, sw)
        mc.save_model(str(fd_dir / "model_cat.cbm"))
        oof_preds["cat"][va_idx] = mc.predict_proba(X_va)[:,1]

        # 3. LightGBM
        ml = train_lightgbm_fold(X_tr, y_tr, X_va, y_va, sw)
        ml.save_model(fd_dir / "model_lgb.txt")
        oof_preds["lgb"][va_idx] = ml.predict(X_va)
        folds[va_idx] = fold
        
        print("Done.")
        # Drop the fold's data and models before the next fold.
        del X_tr, y_tr, X_va, y_va, dtr, dva, mx, mc, ml
        gc.collect()

    # Tune blend weights and threshold on the out-of-fold predictions and
    # persist them for inference.
    print("   Optimizing Weights...", end=" ")
    weights, th = optimize_ensemble_weights(oof_preds, y)
    with open(res_dir / "ensemble_params.json", "w") as f: json.dump({"weights": weights, "threshold": th}, f)
    
    final_pred = sum(oof_preds[m] * weights[m] for m in weights)
    final_lbl = (final_pred >= th).astype("int8")
    
    # Save the out-of-fold predictions together with their fold assignment.
    df = indices.copy(); df["fold"] = folds; df["pred"] = final_pred; df["lbl"] = final_lbl
    df.to_parquet(res_dir / "oof.parquet", index=False)
    
    f1 = f1_score(y, final_lbl, zero_division=0)
    print(f"Best F1: {f1:.4f} (Th={th:.2f}, W={weights})")
    (res_dir / "f1.txt").write_text(f"{f1:.6f}")
    return float(f1)

# =========================================================
# 4. INFERENCE
# =========================================================

def load_ensemble_models(lab_id, behavior):
    """Load the saved fold models of one (lab, behavior).

    Every ``fold_*`` directory under ``RESULTS_DIR/<lab_id>/<behavior>`` that
    contains ``model_xgb.json`` contributes one entry. The CatBoost and
    LightGBM models are optional: if one cannot be loaded, a message is
    printed and its slot is set to None.

    Returns:
        List of dicts with keys ``"xgb"``, ``"cat"`` and ``"lgb"``, ordered by
        fold directory name; empty when the behavior directory does not exist.
    """
    base = RESULTS_DIR / lab_id / behavior
    if not base.exists():
        return []

    models = []
    for fd in sorted(base.glob("fold_*")):
        xgb_path = fd / "model_xgb.json"
        if not xgb_path.exists():
            continue

        xgb_b = xgb.Booster()
        xgb_b.load_model(str(xgb_path))

        # Best-effort loading: `except Exception` (rather than a bare except)
        # keeps KeyboardInterrupt/SystemExit propagating.
        try:
            cat_m = cb.CatBoostClassifier()
            cat_m.load_model(str(fd / "model_cat.cbm"))
        except Exception as exc:
            print(f"Cannot load CatBoost model in {fd}: {exc}")
            cat_m = None

        try:
            lgb_m = lgb.Booster(model_file=str(fd / "model_lgb.txt"))
        except Exception as exc:
            print(f"Cannot load LightGBM model in {fd}: {exc}")
            lgb_m = None

        models.append({"xgb": xgb_b, "cat": cat_m, "lgb": lgb_m})
    return models

def _segments_from_labels(video_id, frames, labels):
    """Collapse per-frame labels into segment dicts, skipping "none" runs.

    A segment covers a maximal run of equal labels; ``stop_frame`` is the last
    frame of the run plus one.
    """
    segs = []
    prev, start, last = "none", None, None
    for frame, label in zip(frames, labels):
        if label != prev:
            if prev != "none":
                segs.append({"video_id": int(video_id), "action": prev, "start_frame": int(start), "stop_frame": int(last) + 1})
            prev, start = label, frame
        last = frame
    if prev != "none":
        segs.append({"video_id": int(video_id), "action": prev, "start_frame": int(start), "stop_frame": int(last) + 1})
    return segs


def predict_behaviors_for_pair(lab_id, video_id, aid, tid, behaviors, test_meta):
    """Predict behavior segments for one (agent, target) pair of a test video.

    For each behavior with saved ensemble parameters and fold models, the
    weighted blend of the fold's XGBoost / CatBoost / LightGBM probabilities is
    kept where it reaches the stored threshold (0 elsewhere) and averaged over
    folds. Each frame gets the behavior with the highest score, or "none" when
    every score is 0; consecutive equal labels are merged into segments.

    Args:
        lab_id: Lab name; anything other than "BoisterousParrot" returns None.
        video_id: Test video identifier (must be convertible to int).
        aid: Agent mouse id.
        tid: Target mouse id.
        behaviors: Candidate behavior names.
        test_meta: Test metadata table.

    Returns:
        None for an unsupported lab, otherwise a DataFrame with the columns
        video_id, action, start_frame, stop_frame (possibly empty).
    """
    out_cols = ["video_id", "action", "start_frame", "stop_frame"]

    if lab_id != "BoisterousParrot":
        return None
    frames, feat_df = get_frame_features_no_cache(lab_id, video_id, aid, tid, test_meta, is_test=True)
    if feat_df.empty:
        return pd.DataFrame(columns=out_cols)

    scores = {}
    for bhv in behaviors:
        params_path = RESULTS_DIR / lab_id / bhv / "ensemble_params.json"
        if not params_path.exists():
            continue
        with open(params_path) as f:
            p = json.load(f)
        ws, th = p["weights"], p["threshold"]

        folds = load_ensemble_models(lab_id, bhv)
        if not folds:
            continue

        # Align the features with the columns the model was trained on;
        # columns missing from feat_df are filled with 0.
        cols = folds[0]["xgb"].feature_names
        X = feat_df.reindex(columns=cols, fill_value=0.0).astype(np.float32)
        dtest = xgb.DMatrix(X, feature_names=cols)

        agg = np.zeros(len(feat_df), dtype=np.float32)
        for m in folds:
            px = m["xgb"].predict(dtest)
            # Optional models that failed to load contribute zeros.
            pc = m["cat"].predict_proba(X)[:, 1] if m["cat"] is not None else np.zeros_like(px)
            pl = m["lgb"].predict(X) if m["lgb"] is not None else np.zeros_like(px)

            avg = px * ws.get("xgb", 0.33) + pc * ws.get("cat", 0.33) + pl * ws.get("lgb", 0.33)
            # Keep the blended score only where it passes the threshold.
            agg += avg * (avg >= th).astype("int8")

        scores[bhv] = agg / len(folds)

        del X, dtest
        gc.collect()

    if not scores:
        return pd.DataFrame(columns=out_cols)

    bl = list(scores.keys())
    mat = np.vstack([scores[b] for b in bl]).T
    lbls = np.where(mat.max(1) == 0, "none", np.array(bl)[mat.argmax(1)])

    return pd.DataFrame(_segments_from_labels(video_id, frames, lbls), columns=out_cols)

# =========================================================
# 5. MAIN
# =========================================================
# Inference is run for this single lab only.
target_lab = "BoisterousParrot"

print("\n=== START INFERENCE ===")
# Keep only the test videos that belong to the target lab.
test_meta = pd.read_csv(INPUT_DIR / "test.csv")
test_meta = test_meta[test_meta["lab_id"] == target_lab].reset_index(drop=True)

# Behaviors with a results directory for this lab, split into self (sb) and
# pair (pb) behaviors.
trained = sorted([p.name for p in (RESULTS_DIR/target_lab).iterdir() if p.is_dir()])
sb, pb = [b for b in trained if b in SELF_BEHAVIORS], [b for b in trained if b in PAIR_BEHAVIORS]

all_segs = []
# Format a mouse id for the submission: prefix "mouse" unless already present.
def fid(i): return str(i) if str(i).startswith("mouse") else f"mouse{i}"

for vid in sorted(test_meta["video_id"].unique()):
    print(f"Predicting Video {vid}...")
    # The tracking file is loaded here only to list the mice in the video.
    tr = load_tracking(target_lab, vid, is_test=True)
    mids = sorted(tr["mouse_id"].unique())
    
    # Self behaviors: each mouse is scored as its own target and reported
    # with target_id "self".
    if sb:
        for m in mids:
            df = predict_behaviors_for_pair(target_lab, vid, m, m, sb, test_meta)
            if df is not None and not df.empty:
                df["agent_id"] = fid(m); df["target_id"] = "self"
                all_segs.append(df)
    # Pair behaviors: every ordered (agent, target) pair of distinct mice.
    if pb and len(mids) > 1:
        for a, t in itertools.permutations(mids, 2):
            df = predict_behaviors_for_pair(target_lab, vid, a, t, pb, test_meta)
            if df is not None and not df.empty:
                df["agent_id"] = fid(a); df["target_id"] = fid(t)
                all_segs.append(df)
    del tr
    gc.collect()

# Submission column order (row_id is inserted in front below).
cols = ["video_id", "agent_id", "target_id", "action", "start_frame", "stop_frame"]

if all_segs:
    sub2 = pd.concat(all_segs, ignore_index=True)
    sub2 = sub2[cols].sort_values(["video_id", "agent_id", "target_id", "action", "start_frame"]).reset_index(drop=True)    
    sub2.insert(0, "row_id", np.arange(len(sub2), dtype=np.int64))
else:
    # No segment predicted: write a header-only file.
    sub2 = pd.DataFrame(columns=["row_id"] + cols)

sub2.to_csv(WORKING_DIR / "submission2.csv", index=False)
print(f"\nDone! Saved submission to {WORKING_DIR / 'submission2.csv'}")


=== START INFERENCE ===

Done! Saved submission to /kaggle/working/submission2.csv


# ElegantMink

In [4]:
from pathlib import Path
import shutil
import gc

WORKING_DIR = Path("/kaggle/working")


def clean_directory(root, keep_suffixes=(".csv",)):
    """Delete everything directly under *root* except files with a kept suffix.

    Files are unlinked and directories are removed recursively. Deletion is
    best-effort: a failure is printed and the loop continues. Nothing happens
    when *root* is not an existing directory.

    Args:
        root: Directory to clean.
        keep_suffixes: File suffixes (as returned by ``Path.suffix``) to keep.
    """
    root = Path(root)
    if not root.is_dir():
        return

    for entry in root.iterdir():
        if entry.is_file():
            if entry.suffix in keep_suffixes:
                continue
            try:
                entry.unlink()
            except OSError as e:
                print(f"Cannot remove file {entry}: {e}")
        elif entry.is_dir():
            # ignore_errors=True never raises, so check afterwards whether
            # anything survived and report it.
            shutil.rmtree(entry, ignore_errors=True)
            if entry.exists():
                print(f"Cannot remove dir {entry}: some contents could not be deleted")


# 1) Remove everything in /kaggle/working except .csv files
clean_directory(WORKING_DIR)


gc.collect()

75

In [5]:
from __future__ import annotations
from typing import Dict, List, Tuple, Any, Optional
import warnings
from dataclasses import dataclass, field
import numpy as np
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from tqdm import tqdm

# Ignore pandas PerformanceWarning and silence NumPy's invalid/divide
# floating-point warnings for the rest of the session.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
np.seterr(invalid="ignore", divide="ignore")

# =============================================================================
# 1. CONFIGURATION
# =============================================================================
@dataclass
class FeatureConfig:
    """
    Holds the feature-extraction parameters (hyperparameters).
    """
    # Video frame rate; callers pass the metadata's frames_per_second.
    fps: float = 30.0
    # Pixel-to-centimetre scale; callers pass the metadata's pix_per_cm_approx.
    pix_per_cm: float = 1.0
    # Smoothing strength. NOTE(review): presumably the sigma of a Gaussian
    # filter applied to trajectories - confirm where it is used.
    smooth_sigma: float = 1.0
    # When False, no target context is built, so target-dependent feature
    # functions receive target_ctx=None.
    use_pairwise: bool = True


# =============================================================================
# 2. AGENT CONTEXT
# =============================================================================
@dataclass
class AgentContext:
    """
    Container for the preprocessed data of a single mouse.
    Keeping these arrays together avoids recomputing velocity/acceleration several times.
    """
    idx: pd.Index          # Frame index
    pos: np.ndarray        # [F, 2] cm
    vel: np.ndarray        # [F, 2] cm/s
    speed: np.ndarray      # [F, 1] cm/s
    acc: np.ndarray        # [F, 2] cm/s^2
    
    cx: pd.Series          # X coordinate as a Series (for rolling operations)
    cy: pd.Series          # Y coordinate as a Series
    speed_series: pd.Series # Speed as a Series
    
    raw_df: Optional[pd.DataFrame] = None # Original per-body-part tracking data


# =============================================================================
# 3. FEATURE EXTRACTOR
# =============================================================================
class FeatureExtractor:
    """
    Class trích xuất đặc trưng hành vi từ dữ liệu tracking.
    """
    def __init__(self, fps: float, pix_per_cm: float, smooth_sigma: float = 1.0, use_pairwise: bool = True):
        """Store the hyperparameters and register the feature functions by name.

        Args:
            fps: Frame rate of the video (cast to float).
            pix_per_cm: Pixels per centimetre (cast to float).
            smooth_sigma: Gaussian smoothing sigma, stored unchanged.
            use_pairwise: Stored unchanged in the config.
        """
        config = FeatureConfig(
            fps=float(fps),
            pix_per_cm=float(pix_per_cm),
            smooth_sigma=smooth_sigma,
            use_pairwise=use_pairwise,
        )
        self.cfg = config

        # (registry key, method name) pairs; the dict keeps this order.
        modules = (
            ("kinematics", "_feat_basic_kinematics"),
            ("multiscale", "_feat_multiscale"),
            ("long_range", "_feat_long_range"),
            ("cumulative", "_feat_cumulative"),
            ("curvature", "_feat_curvature"),
            ("speed_asym", "_feat_speed_asym"),
            ("gauss_shift", "_feat_gauss_shift"),
            ("pose_shape", "_feat_pose_shape"),
            ("pairwise", "_feat_pairwise"),
            ("follow", "_feat_follow_pattern"),
            ("short", "_feat_shortburst_social"),
            ("a", "_feat_attack_sniff"),
            ("b", "_feat_climb"),
            ("c", "_feat_ejaculate_temporal"),
        )
        self.feature_registry = {key: getattr(self, attr) for key, attr in modules}

    # --- Helpers ---
    def _scale(self, n_frames_30fps: int) -> int:
        """Quy đổi số frame từ chuẩn 30fps sang fps thực tế của video."""
        return max(1, int(round(n_frames_30fps * self.cfg.fps / 30.0)))

    def _to_cm(self, arr):
        """Chuyển pixel -> cm."""
        return arr / self.cfg.pix_per_cm

    def _smooth(self, x):
        """Làm mượt dữ liệu bằng Gaussian filter."""
        if self.cfg.smooth_sigma is None or x.shape[0] < 3: return x
        if np.all(np.isnan(x)): return x
        return gaussian_filter1d(x, sigma=self.cfg.smooth_sigma, axis=0, mode="nearest")

    def _forward_fill_nan(self, pos):
        """
        Điền dữ liệu thiếu (NaN) bằng giá trị hợp lệ trước đó (Forward Fill).
        """
        if np.all(np.isnan(pos)):
            return np.zeros_like(pos)

        pos_ffill = pos.copy()
        mask = np.any(~np.isnan(pos_ffill), axis=1)
        if not mask.any():
            return np.zeros_like(pos_ffill)

        valid_idx = np.where(mask)[0]
        first, last = valid_idx[0], valid_idx[-1]
        pos_ffill[:first] = pos_ffill[first]
        pos_ffill[last + 1:] = pos_ffill[last]
        df_temp = pd.DataFrame(pos_ffill)
        df_temp = df_temp.ffill()
        return df_temp.to_numpy()
    
    def _speed_series(self, cx: pd.Series, cy: pd.Series) -> pd.Series:
        dx = cx.diff()
        dy = cy.diff()
        v = np.hypot(dx, dy).fillna(0.0) * self.cfg.fps
        return v.astype("float32")
    
    def _roll_future_mean(self, s: pd.Series, w: int, min_p: int = 1) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).mean().iloc[::-1]

    def _roll_future_var(self, s: pd.Series, w: int, min_p: int = 2) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).var().iloc[::-1]

    # --- Core Logic ---
    def _compute_kinematics(self, pos_px: np.ndarray):
        """
        Tính toán vật lý cơ bản: Pos(cm), Vel, Speed, Acc.
        Input: Array [Frames, 2] (pixel).
        Output: Tuple (pos_cm, vel, speed, acc).
        """
        pos_ffill = self._forward_fill_nan(pos_px)
        pos_cm = self._to_cm(pos_ffill.astype(np.float32))
        pos_cm = self._smooth(pos_cm)                                               # [F, 2]

        dt = 1.0 / self.cfg.fps
        vel = np.zeros_like(pos_cm, dtype=np.float32)
        vel[1:] = (pos_cm[1:] - pos_cm[:-1]) / dt                                   # [F, 2: (vx, vy)]
        speed = np.linalg.norm(vel, axis=1, keepdims=True).astype(np.float32)       # [F, 1]

        acc = np.zeros_like(pos_cm, dtype=np.float32)                          
        acc[1:] = (vel[1:] - vel[:-1]) / dt                                         # [F, 2:(ax, ay)]
        return pos_cm.astype(np.float32), vel, speed, acc

    def _build_context(self, frames, pos_px, mouse_df=None) -> AgentContext:
        """Build the AgentContext holding the kinematics of one mouse.

        Args:
            frames: Frame labels used as the index (named "frame").
            pos_px: Pixel positions passed to ``_compute_kinematics``.
            mouse_df: Optional raw body-part DataFrame stored as ``raw_df``.
        """
        pos_cm, vel, speed, acc = self._compute_kinematics(pos_px)
        frame_index = pd.Index(frames, name="frame")

        def as_series(column):
            return pd.Series(column, index=frame_index)

        return AgentContext(
            idx=frame_index,
            pos=pos_cm,
            vel=vel,
            speed=speed,
            acc=acc,
            cx=as_series(pos_cm[:, 0]),
            cy=as_series(pos_cm[:, 1]),
            speed_series=as_series(speed[:, 0]),
            raw_df=mouse_df,
        )

    # --- Feature Modules ---
    def _feat_basic_kinematics(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Lấy các giá trị thô: tọa độ x, y, vận tốc vx, vy, tốc độ, gia tốc ax, ay.
        """
        return {
            "a_x": ctx.pos[:, 0], "a_y": ctx.pos[:, 1],
            "a_vx": ctx.vel[:, 0], "a_vy": ctx.vel[:, 1],
            "a_speed": ctx.speed[:, 0],
            "a_ax": ctx.acc[:, 0], "a_ay": ctx.acc[:, 1]
        }

    def _feat_multiscale(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tính tốc độ trung bình (Mean) và độ lệch chuẩn (Std) ở đa mức thời gian.
        Feature 'sp_ratio' đo độ bùng nổ (Burstiness).
        """
        feats = {}
        speed = ctx.speed_series
        frame_scales = [10, 40, 160]
        for scale in frame_scales:
            ws = self._scale(scale)
            if len(speed) >= ws:
                roller = speed.rolling(ws, min_periods=max(1, ws//4), center=True)
                feats[f"sp_m{scale}"] = roller.mean().astype("float32")
                feats[f"sp_s{scale}"] = roller.std().astype("float32")
        feats[f"sp_ratio"] = feats["sp_m10"] / (feats["sp_m160"] + 1e-6)
        return feats 
        
    def _feat_long_range(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Đặc trưng ngữ cảnh dài hạn:
        - x_ml, y_ml: Vị trí trung bình trong quá khứ.
        - sp_pct: Xếp hạng (percentile) của tốc độ hiện tại so với quá khứ.
        """
        feats: Dict[str, pd.Series] = {}
        speed = ctx.speed_series

        for window in [120, 240]:
            ws = self._scale(window)
            if len(ctx.cx) >= ws:
                feats[f"x_ml{window}"] = ctx.cx.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()
                feats[f"y_ml{window}"] = ctx.cy.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()

        for span in [60, 120]:
            s = self._scale(span)
            feats[f"x_e{span}"] = ctx.cx.ewm(span=s, min_periods=1).mean()
            feats[f"y_e{span}"] = ctx.cy.ewm(span=s, min_periods=1).mean()

        for window in [60, 120]:
            ws = self._scale(window)
            if len(speed) >= ws:
                feats[f"sp_pct{window}"] = speed.rolling(
                    ws, min_periods=max(5, ws // 6), center=True
                ).rank(pct=True)
        return feats
    

    def _feat_curvature(self, ctx: AgentContext, **kwargs) -> Dict:
        feats = {}

        vel_x, vel_y = ctx.vel[:, 0], ctx.vel[:, 1]
        acc_x, acc_y = ctx.acc[:, 0], ctx.acc[:, 1]
        cross_prod = vel_x * acc_y - vel_y * acc_x
        vel_mag = np.sqrt(vel_x**2 + vel_y**2)
        moving_mask = vel_mag > 2.0
        vel_mag_safe = np.maximum(vel_mag, 0.1 / self.cfg.fps)
        raw_curv = cross_prod / (vel_mag_safe**3)
        raw_curv = np.where(moving_mask, raw_curv, 0.0)
        min_turn_radius_cm = 0.5
        max_k = 1.0 / min_turn_radius_cm
        raw_curv = np.clip(raw_curv, -max_k, max_k)
        abs_curv = np.abs(raw_curv)
        abs_curv_series = pd.Series(abs_curv, index=ctx.idx)

        for w in [30, 60]:
            ws = self._scale(w)
            min_p = max(ws // 3, 1)
            feats[f"curv_mean_{w}"] = abs_curv_series.rolling(ws, min_periods=min_p).mean()

        angle = np.arctan2(vel_y, vel_x)
        angle_series = pd.Series(angle, index=ctx.idx)
        angle_change = np.abs(angle_series.diff().fillna(0.0))
        angle_change = np.where(angle_change > np.pi, 2 * np.pi - angle_change, angle_change)
        angle_change_series = pd.Series(angle_change, index=ctx.idx)
        angle_change_series = pd.Series(np.where(moving_mask, angle_change_series, 0.0), index=ctx.idx)

        ws = self._scale(30)
        feats["turn_rate_30"] = angle_change_series.rolling(ws, min_periods=max(ws // 3, 1)).sum()

        return feats
    
    def _feat_cumulative(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tổng quãng đường di chuyển trong một khoảng thời gian dài xung quanh frame hiện tại.
        """
        feats = {}
        L = max(1, self._scale(180))
        step = np.hypot(ctx.cx.diff(), ctx.cy.diff()).fillna(0.0)
        path = step.rolling(2 * L + 1, min_periods=max(5, L // 6), center=True).sum()
        feats["path_cum180"] =  path.fillna(0.0).astype("float32")
        return feats

    def _feat_speed_asym(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Bất đối xứng tốc độ (Tương lai - Quá khứ).
        """
        w = max(3, self._scale(30))
        v = ctx.speed_series
        v_past = v.rolling(w, min_periods=1).mean()
        v_fut = self._roll_future_mean(v, w, min_p=1)
        return {"spd_asym_1s": (v_fut - v_past).fillna(0.0)}
    
    def _feat_gauss_shift(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Độ lệch Gaussian (KL Divergence) giữa quá khứ và tương lai.
        Đo lường sự thay đổi trạng thái thống kê.
        """
        w = max(5, self._scale(30))
        v = ctx.speed_series
        mu_p = v.rolling(w, min_periods=1).mean()
        va_p = v.rolling(w, min_periods=1).var().clip(lower=1e-6)
        mu_f = self._roll_future_mean(v, w, min_p=1)
        va_f = self._roll_future_var(v, w, min_p=1).clip(lower=1e-6)

        kl_pf = 0.5 * (
            (va_p / va_f) + ((mu_f - mu_p) ** 2) / va_f - 1.0 + np.log(va_f / va_p)
        )
        kl_fp = 0.5 * (
            (va_f / va_p) + ((mu_p - mu_f) ** 2) / va_p - 1.0 + np.log(va_p / va_f)
        )
        return {
            "spd_symkl_1s": (kl_pf + kl_fp).replace([np.inf, -np.inf], np.nan).fillna(0.0)
        }
    
    def _extract_part(self, ctx: AgentContext, part: str) -> Optional[np.ndarray]:
        if ctx.raw_df is None: return None
        if part not in ctx.raw_df.columns.get_level_values(0): return None
        try:
            sub_df = ctx.raw_df.xs(part, axis=1, level=0)[["x", "y"]].reindex(ctx.idx)
        except KeyError: return None
        raw = sub_df.to_numpy()
        raw = self._forward_fill_nan(raw)
        cm = self._to_cm(raw.astype(np.float32))
        return self._smooth(cm)
    
    def _extract_parts_dict(self, ctx: AgentContext, parts: List[str] = None) -> Dict[str, Optional[np.ndarray]]:
        out = {}
        for p in parts:
            out[p] = self._extract_part(ctx, p)
        return out
        
    def _feat_pose_shape(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Placeholder cho các đặc trưng hình dáng (Elongation, Body Angle...).
        """
        feats = {}

        def zero(): return pd.Series(0.0, index=ctx.idx, dtype="float32")

        def dist(k1, k2):
            p1, p2 = parts.get(k1), parts.get(k2)
            if p1 is None or p2 is None: return zero()
            d = np.linalg.norm(p1 - p2, axis=1)
            return pd.Series(d, index=ctx.idx, dtype="float32")
        
        def body_angle():
            if parts.get("nose") is None: return zero()
            if parts.get("neck") is None: return zero()
            if parts.get("tail_base") is None: return zero()

            v1 = parts.get("nose") - parts.get("neck")
            v2 = parts.get("tail_base") - parts.get("neck")
            dot_product = np.sum(v1 * v2, axis=1)
            mag = np.linalg.norm(v1, axis=1) * np.linalg.norm(v2, axis=1)
            cos_angle = np.clip(dot_product / (mag + 1e-6), -1.0, 1.0).astype("float32")
            return cos_angle
        
        def elongation():
            if parts.get("nose")          is None: return zero()
            if parts.get("tail_base")     is None: return zero()
            if parts.get("hip_left")  is None: return zero()
            if parts.get("hip_right") is None: return zero()

            d1 = dist("nose", "tail_base")
            d2 = dist("hip_left", "hip_right")
            elongation = d1 / (d2 + 1e-6).astype("float32")
            return elongation
        
        def part_speed(part: str, n_frames_30fps: int) -> Dict:
            part_pos = self._extract_part(ctx, part)
            if part_pos is None: return zero()
            
            s_x = pd.Series(part_pos[:, 0], index=ctx.idx)
            s_y = pd.Series(part_pos[:, 1], index=ctx.idx)
            raw_speed = self._speed_series(s_x, s_y)

            ws = self._scale(n_frames_30fps)
            val = raw_speed.rolling(ws, min_periods=1, center=True).mean()
            return val.astype("float32")


        target_parts = ["nose", "hip_left", "hip_right", "ear_left", "ear_right", "tail_base", "neck"]
        
        parts = self._extract_parts_dict(ctx, target_parts)

        feats["aa_nose_tailbase_dist"]       = dist("nose", "tail_base")
        feats["aa_earleft_tailbase_dist"]    = dist("ear_left", "tail_base")
        feats["aa_earright_tailbase_dist"]   = dist("ear_right", "tail_base")
        feats["aa_nose_earleft_dist"]        = dist("ear_left", "nose")
        feats["aa_nose_ear_right_dist"]      = dist("ear_right", "nose")
        feats["aa_nose_hip_left_dist"]       = dist("nose", "hip_left")
        feats["aa_nose_hip_right_dist"]      = dist("nose", "hip_right")
        feats["aa_neck_tailbase_dist"] = dist("neck", "tail_base")
        
        # feats["a_elongation"]                = elongation()
        feats["a_bodyangle"]                 = body_angle()

        a_tail_base_vel_500ms     = part_speed("tail_base", 15)
        a_tail_base_vel_1000ms    = part_speed("tail_base", 30)
        a_tail_base_vel_2000ms    = part_speed("tail_base", 60)
        a_tail_base_vel_3000ms    = part_speed("tail_base", 90)


        a_hip_left_vel_500ms          = part_speed("hip_left", 15)
        a_hip_left_vel_1000ms         = part_speed("hip_left", 30)
        a_hip_left_vel_2000ms         = part_speed("hip_left", 60)
        a_hip_left_vel_3000ms         = part_speed("hip_left", 90)

        a_hip_right_vel_500ms          = part_speed("hip_left", 15)
        a_hip_right_vel_1000ms         = part_speed("hip_left", 30)
        a_hip_right_vel_2000ms         = part_speed("hip_left", 60)
        a_hip_right_vel_3000ms         = part_speed("hip_left", 90)

        feats["a_upper_vel_500ms"]            = (a_tail_base_vel_500ms + a_hip_left_vel_500ms + a_hip_right_vel_500ms)/3.0
        feats["a_upper_vel_1000ms"]           = (a_tail_base_vel_1000ms + a_hip_left_vel_1000ms + a_hip_right_vel_1000ms)/3.0
        feats["a_upper_vel_2000ms"]           = (a_tail_base_vel_2000ms + a_hip_left_vel_2000ms + a_hip_right_vel_2000ms)/3.0
        feats["a_upper_vel_3000ms"]           = (a_tail_base_vel_3000ms + a_hip_left_vel_3000ms + a_hip_right_vel_3000ms)/3.0


        feats["a_nose_vel_500ms"]            = part_speed("nose", 15)
        feats["a_nose_vel_1000ms"]           = part_speed("nose", 30)
        feats["a_nose_vel_2000ms"]           = part_speed("nose", 60)
        feats["a_nose_vel_3000ms"]           = part_speed("nose", 90)

        # feats["a_ear_right_vel_500ms"]       = part_speed("hip_right", 15)
        # feats["a_ear_right_vel_1000ms"]      = part_speed("hip_right", 30)
        # feats["a_ear_right_vel_2000ms"]      = part_speed("hip_right", 60)
        # feats["a_ear_right_vel_3000ms"]      = part_speed("hip_right", 90)
        # feats["a_ear_left_vel_500ms"]        = part_speed("ear_left", 15)
        # feats["a_ear_left_vel_1000ms"]       = part_speed("ear_left", 30)
        # feats["a_ear_left_vel_2000ms"]       = part_speed("ear_left", 60)
        # feats["a_ear_left_vel_3000ms"]       = part_speed("ear_left", 90)
        
        return feats

    def _feat_attack_sniff(
        self,
        ctx: AgentContext,
        target_ctx: AgentContext = None,
        **kwargs
    ) -> Dict[str, pd.Series]:
        """
        Đặc trưng phân biệt attack vs sniff cho lab 2-mouse (agent=1, target=2).
    
        Ý tưởng:
          - attack: speed 2 con biến động mạnh, đổi hướng nhiều, body overlap cao.
          - sniff : mũi gần cổ/thân, overlap thấp hơn, motion nhẹ/ổn định hơn.
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
    
        def zero():
            return pd.Series(0.0, index=idx, dtype="float32")

        # helper khoảng cách
        def dist(p1, p2):
            if p1 is None or p2 is None:
                return zero()
            d = np.linalg.norm(p1 - p2, axis=1)
            return pd.Series(d, index=idx, dtype="float32")

        parts_a = self._extract_parts_dict(ctx, ["nose", "tail_base"])
        parts_t = self._extract_parts_dict(target_ctx, ["nose", "tail_base"])
    
        # ---------------------------------------------------------
        # 2) ĐIỂM ĐẠI DIỆN THÂN (BODY CENTER) CHO MỖI CON
        #    dùng trung bình neck – hips – tail_base
        # ---------------------------------------------------------
    
        # ---------------------------------------------------------
        # 4) MỨC ĐỘ “BẠO LỰC”: DAO ĐỘNG TỐC ĐỘ & ĐỔI HƯỚNG
        # ---------------------------------------------------------
        # speed 2 con từ velocity
        a_speed = pd.Series(
            np.linalg.norm(ctx.vel, axis=1),
            index=idx,
            dtype="float32",
        )
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32",
        )

        ws_05 = self._scale(15)  # ~0.5s
        mp_05 = max(ws_05 // 3, 1)
    
        feats["as_a_speed_std_05"] = (
            a_speed.rolling(ws_05, min_periods=mp_05).std().fillna(0.0).astype("float32")
        )
        feats["as_t_speed_std_05"] = (
            t_speed.rolling(ws_05, min_periods=mp_05).std().fillna(0.0).astype("float32")
        )
        feats["as_speed_std_sum_05"] = (
            feats["as_a_speed_std_05"] + feats["as_t_speed_std_05"]
        )
    
        # Đổi hướng (jerk góc) của agent
        a_angle = np.arctan2(ctx.vel[:, 1], ctx.vel[:, 0])
        a_angle_diff = np.abs(np.diff(a_angle))
        a_angle_diff = np.where(
            a_angle_diff > np.pi, 2 * np.pi - a_angle_diff, a_angle_diff
        )
        a_angle_diff = np.concatenate([[0.0], a_angle_diff])
        a_angle_diff_s = pd.Series(a_angle_diff, index=idx, dtype="float32")
    
        feats["as_a_turn_jerk_05"] = (
            a_angle_diff_s.rolling(ws_05, min_periods=mp_05)
            .sum()
            .fillna(0.0)
            .astype("float32")
        )

        # ---------------------------------------------------------
        # 5) XẤP XỈ OVERLAP CƠ THỂ (BODY OVERLAP)
        #    dùng bbox từ các bộ phận thân
        # ---------------------------------------------------------
        def build_bbox(parts: Dict[str, Optional[np.ndarray]]):
            arrs = []
            for k in ["nose", "hip_left", "hip_right", "ear_left", "ear_right", "tail_base"]:
                if parts.get(k) is not None:
                    arrs.append(parts[k])
            if not arrs:
                return None
            stack = np.stack(arrs, axis=1)  # [F, K, 2]
            xs = stack[:, :, 0]
            ys = stack[:, :, 1]
            xmin = np.nanmin(xs, axis=1)
            xmax = np.nanmax(xs, axis=1)
            ymin = np.nanmin(ys, axis=1)
            ymax = np.nanmax(ys, axis=1)
            return np.stack([xmin, ymin, xmax, ymax], axis=1).astype("float32")
    
        def iou_box(box1: np.ndarray, box2: np.ndarray):
            # box: [F, 4] = (xmin, ymin, xmax, ymax)
            x1 = np.maximum(box1[:, 0], box2[:, 0])
            y1 = np.maximum(box1[:, 1], box2[:, 1])
            x2 = np.minimum(box1[:, 2], box2[:, 2])
            y2 = np.minimum(box1[:, 3], box2[:, 3])
    
            inter_w = np.clip(x2 - x1, 0.0, None)
            inter_h = np.clip(y2 - y1, 0.0, None)
            inter = inter_w * inter_h
    
            area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])
            area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
            union = area1 + area2 - inter + 1e-6
            iou = inter / union
            return iou.astype("float32")

        bbox_a = build_bbox(parts_a)
        bbox_t = build_bbox(parts_t)
        if bbox_a is not None and bbox_t is not None:
            iou = iou_box(bbox_a, bbox_t)
            iou_s = pd.Series(iou, index=idx, dtype="float32")
    
            feats["as_body_iou"] = iou_s
    
            ws_1s = self._scale(30)
            mp_1s = max(ws_1s // 3, 1)
            feats["as_body_iou_mean_1s"] = (
                iou_s.rolling(ws_1s, min_periods=mp_1s).mean().fillna(0.0).astype("float32")
            )
        else:
            feats["as_body_iou"] = zero()
            feats["as_body_iou_mean_1s"] = zero()
    
        # ---------------------------------------------------------
        # 6) DỌN NẠN NaN / Inf
        # ---------------------------------------------------------
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )
    
        return feats

    def _feat_climb(self, ctx: AgentContext, **kwargs) -> Dict[str, pd.Series]:
        """
        Feature chuyên cho hành vi climb trong arena hình chữ nhật (33 x 19 cm).
    
        Ý tưởng:
          - Chuột đi gần tường: dist_wall giảm nhanh.
          - Khi climb: sát tường (dist_wall nhỏ), v_normal ~ 0,
            nhưng vẫn có v_tangent (bò ngang trên tường / di chuyển dọc biên).
        """
        feats: Dict[str, pd.Series] = {}
        idx = ctx.idx
    
        def zero() -> pd.Series:
            return pd.Series(0.0, index=idx, dtype="float32")
    
        # --- 1. Arena size (cm) ---
        # Nếu bạn đã set trong FeatureConfig thì dùng:
        # W = self.cfg.arena_width_cm or 33.0
        # H = self.cfg.arena_height_cm or 19.0
        # Ở đây fix luôn cho lab này:
        W = 28.0
        H = 18.0
        parts = self._extract_parts_dict(ctx, ["nose"])
        head = parts.get("nose")
        
        if head is not None:
            # head đã ở đơn vị cm (vì _extract_part đã to_cm + smooth)
            cx = pd.Series(head[:, 0], index=idx)
            cy = pd.Series(head[:, 1], index=idx)
        else:
            # fallback: nếu không có head thì dùng body_center như cũ
            cx = ctx.cx
            cy = ctx.cy


        # # --- 2. Khoảng cách tới 4 bức tường ---
        # cx = ctx.cx  # Series
        # cy = ctx.cy  # Series
    
        dist_left   = cx - 0.0
        dist_right  = W - cx
        dist_bottom = cy - 0.0
        dist_top    = H - cy
    
        d_all = np.stack(
            [dist_left.values, dist_right.values, dist_bottom.values, dist_top.values],
            axis=1,  # [F, 4]
        )
    
        dist_wall = np.min(d_all, axis=1)          # khoảng cách tới tường gần nhất
        wall_idx  = np.argmin(d_all, axis=1)       # 0:left, 1:right, 2:bottom, 3:top
    
        dist_wall_s = pd.Series(dist_wall, index=idx, dtype="float32")
        feats["climb_dist_wall"] = dist_wall_s
    
        # --- 3. Vận tốc theo NORMAL & TANGENT của tường gần nhất ---
        vx = ctx.vel[:, 0]
        vy = ctx.vel[:, 1]
    
        # normal hướng VÀO trong arena từ tường
        nx = np.zeros_like(vx, dtype="float32")
        ny = np.zeros_like(vy, dtype="float32")

        # left  wall (x=0)    → normal = (+1, 0)
        # right wall (x=W)    → normal = (-1, 0)
        # bottom wall (y=0)   → normal = (0, +1)
        # top wall (y=H)      → normal = (0, -1)
        nx[wall_idx == 0] =  1.0
        nx[wall_idx == 1] = -1.0
        ny[wall_idx == 2] =  1.0
        ny[wall_idx == 3] = -1.0
    
        # v_normal = v ⋅ n
        v_normal = vx * nx + vy * ny
    
        # thành phần song song tường: v_tan = v - (v⋅n)n
        v_proj_x = v_normal * nx
        v_proj_y = v_normal * ny
        v_tan_x = vx - v_proj_x
        v_tan_y = vy - v_proj_y
        v_tangent = np.sqrt(v_tan_x ** 2 + v_tan_y ** 2)
    
        v_normal_s  = pd.Series(v_normal,  index=idx, dtype="float32")
        v_tangent_s = pd.Series(v_tangent, index=idx, dtype="float32")
    
        feats["climb_normal_vel"]  = v_normal_s
        feats["climb_tangent_vel"] = v_tangent_s
    
        # --- 4. Approach speed: dist_wall giảm mạnh (lao vào tường) ---
        ws = self._scale(15)  # ~0.5s (15 frame ở 30fps)
        min_p = max(ws // 3, 1)

        # diff_dw > 0 khi dist_wall giảm (đi về phía tường)
        diff_dw = -dist_wall_s.diff().fillna(0.0)  # dấu trừ để "giảm" → dương
        approach = diff_dw.rolling(ws, min_periods=min_p).mean()
        feats["climb_approach_speed_wall"] = approach.astype("float32")
    
        # --- 5. Stick score: sát tường + không còn lao vào (v_normal nhỏ) ---
        # gần tường
        thr_cm = 3.0  # tuỳ chỉnh (3cm sát tường)
        near_wall = (dist_wall_s < thr_cm).astype("float32")
    
        # ít lao vào nữa: |v_normal| nhỏ
        stick = near_wall * (1.0 / (1.0 + v_normal_s.abs()))

        # Nếu muốn climb thực sự có chút chuyển động dọc tường:
        # yêu cầu v_tangent > một ngưỡng nhỏ (ví dụ 0.5 cm/s)
        stick = stick * (v_tangent_s > 0.5).astype("float32")
    
        feats["climb_wall_stick_score"] = stick.astype("float32")
    
        # --- 6. Clean NaN/Inf ---
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )
    
        return feats


    def _feat_pairwise(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict:
        """
        Đặc trưng tương tác cặp đôi (Pairwise): Khoảng cách, Tốc độ tiếp cận.
        """
        feats = {}
        if target_ctx is None: 
            return feats

        def zero(): return pd.Series(0.0, index=ctx.idx, dtype="float32")

        def dist_ab(pt_a, pt_b):
            if pt_a is None or pt_b is None: return zero()
            d = np.linalg.norm(pt_a - pt_b, axis=1)
            return pd.Series(d, index=ctx.idx, dtype="float32")

        rel_vec = target_ctx.pos - ctx.pos
        dist = np.linalg.norm(rel_vec, axis=1)
        feats["rel_dist"] = pd.Series(dist, index=ctx.idx, dtype="float32")

        # Khoảng cách
        my_parts = self._extract_parts_dict(ctx, ["nose", "ear_left", "ear_right", "body_center", "tail_base", "hip_left", "hip_right", "neck"])
        target_parts = self._extract_parts_dict(target_ctx, ["nose", "ear_left", "ear_right", "body_center", "tail_base", "hip_left", "hip_right", "neck"])

        an, tn = my_parts["nose"], target_parts["nose"]
        feats["dist_nose_nose"] = dist_ab(an, tn)
        feats["dist_nose_tail"] = dist_ab(an, target_parts["tail_base"])
        feats["dist_nose_el"]   = dist_ab(an, target_parts["ear_left"])
        feats["dist_nose_er"]   = dist_ab(an, target_parts["ear_right"])
        feats["dist_nose_hip_l"] = dist_ab(an, target_parts["hip_left"])
        feats["dist_nose_hip_r"] = dist_ab(an, target_parts["hip_right"])
        feats["dist_nose_neck"] = dist_ab(an, target_parts["neck"])

        
        #  Hướng - góc nhìn
        def get_body_vec(parts_dict):
            head = parts_dict.get("nose")
            tail = parts_dict.get("tail_base")
            if head is not None and tail is not None:
                return head - tail
            return None

        a_vec = get_body_vec(my_parts)
        t_vec = get_body_vec(target_parts)

        if a_vec is not None and t_vec is not None:
            dot = np.sum(a_vec * t_vec, axis=1)
            mags = np.linalg.norm(a_vec, axis=1) * np.linalg.norm(t_vec, axis=1)
            feats["body_cosine"] = pd.Series(
                np.clip(dot / (mags + 1e-6), -1.0, 1.0), index=ctx.idx, dtype="float32"
            )
        else:
            feats["body_cosine"] = zero()

        # Vector ánh nhìn = Target_Pos - My_Pos = rel_vec
        if a_vec is not None:
            dot_gaze = np.sum(a_vec * rel_vec, axis=1)
            mag_a = np.linalg.norm(a_vec, axis=1)
            feats["gaze_cosine"] = pd.Series(
                np.clip(dot_gaze / (mag_a * dist + 1e-6), -1.0, 1.0),
                index=ctx.idx, dtype="float32"
            )
        else:
            feats["gaze_cosine"] = zero()

        # Vector đơn vị hướng về địch (u)
        dist_safe = dist.copy()
        dist_safe[dist_safe == 0] = 1e-6
        u_vec = rel_vec / dist_safe[:, None]

        # a_vel và t_vel lấy từ Context
        a_vel, t_vel = ctx.vel, target_ctx.vel

        # A. Approach Speed (Vận tốc dọc trục nối 2 con)
        # Dương: Lao vào nhau | Âm: Chạy ra xa nhau
        a_along = np.sum(a_vel * u_vec, axis=1)
        t_along = np.sum(t_vel * (-u_vec), axis=1) # Target hướng ngược lại
        rel_along = np.sum((a_vel - t_vel) * u_vec, axis=1)

        # B. Lateral Speed (Vận tốc ngang - Vuông góc trục nối)
        # Vector chiếu: v_proj = (v . u) * u
        a_proj = a_along[:, None] * u_vec
        a_lat_vec = a_vel - a_proj
        a_lat_speed = np.linalg.norm(a_lat_vec, axis=1)

        feats["approach_speed_agent"]  = pd.Series(a_along, index=ctx.idx, dtype="float32")
        feats["approach_speed_target"] = pd.Series(t_along, index=ctx.idx, dtype="float32")
        feats["approach_speed_rel"]    = pd.Series(rel_along, index=ctx.idx, dtype="float32")
        feats["lateral_speed_agent"]   = pd.Series(a_lat_speed, index=ctx.idx, dtype="float32")

        return feats


    def _feat_ejaculate_temporal(
        self,
        ctx: AgentContext,
        target_ctx: AgentContext = None,
        **kwargs
    ) -> Dict[str, pd.Series]:
        """
        Pairwise features aimed at the 'ejaculate' behavior.

        The pattern being described: the two animals are pressed together with
        the agent close to the target's tail/genital region, the agent moved
        strongly while in contact shortly before, and is now (almost)
        motionless while still in contact.

        Args:
            ctx: Context of the agent.
            target_ctx: Context of the target. When ``None`` no feature is
                produced.
            **kwargs: Ignored.

        Returns:
            Mapping ``feature name -> float32 Series`` indexed by ``ctx.idx``;
            empty when ``target_ctx`` is ``None``. NaN and +/-inf are replaced
            by 0 in every returned series.
        """
        if target_ctx is None:
            return {}

        frame_idx = ctx.idx

        def as_f32(values) -> pd.Series:
            # Wrap a scalar/array as a float32 Series on the agent's frame index.
            return pd.Series(values, index=frame_idx, dtype="float32")

        def gap(p: Optional[np.ndarray], q: Optional[np.ndarray]) -> pd.Series:
            # Per-frame Euclidean distance; all zeros when either part is missing.
            if p is None or q is None:
                return as_f32(0.0)
            return as_f32(np.linalg.norm(p - q, axis=1).astype("float32"))

        # -------------------------------------------------
        # 1. Body parts: body centre and approximate genital position
        # -------------------------------------------------
        agent_parts = self._extract_parts_dict(
            ctx,
            ["nose", "body_center", "tail_base", "hip_left", "hip_right"]
        )
        # Target: tail_base stands in for the genital area.
        target_parts = self._extract_parts_dict(
            target_ctx,
            ["body_center", "tail_base"]
        )

        agent_nose = agent_parts.get("nose")
        agent_tail = agent_parts.get("tail_base")
        target_tail = target_parts.get("tail_base")

        # body_center falls back to tail_base when it is unavailable.
        agent_body = agent_parts.get("body_center")
        if agent_body is None:
            agent_body = agent_tail
        target_body = target_parts.get("body_center")
        if target_body is None:
            target_body = target_tail

        body_gap = gap(agent_body, target_body)      # how tightly the bodies are together
        genital_gap = gap(agent_body, target_tail)   # agent body -> target tail base
        nose_gap = gap(agent_nose, target_tail)      # agent nose -> target tail base

        # -------------------------------------------------
        # 2. Proximity scores (small distance -> large score)
        # -------------------------------------------------
        # Exponential decay with a scale of 5.0 for body-body distance,
        # 1 / (1 + d) for the two genital distances.
        body_prox = np.exp(-body_gap.to_numpy() / 5.0).astype("float32")
        genital_prox = 1.0 / (1.0 + genital_gap.to_numpy())
        nose_prox = 1.0 / (1.0 + nose_gap.to_numpy())

        # -------------------------------------------------
        # 3. Build-up memory: strong activity while in contact
        # -------------------------------------------------
        speed = ctx.speed_series
        touching = (body_gap < 5.0).astype("float32")
        contact_speed = (speed * touching).astype("float32")

        # Window of 90 reference frames passed through self._scale, at least 1.
        mem_window = max(self._scale(90), 1)
        recent_peak = (
            contact_speed.rolling(mem_window, min_periods=1)
            .max()
            .fillna(0.0)
            .astype("float32")
        )

        # -------------------------------------------------
        # 4. Present state: nearly motionless, distance stable
        # -------------------------------------------------
        stationary = (speed < 1.5).astype("float32")
        gap_change = body_gap.diff().abs().fillna(0.0).astype("float32")

        feats: Dict[str, pd.Series] = {
            "ejac_dist_body": body_gap,
            "ejac_dist_gen_body": genital_gap,
            "ejac_dist_gen_nose": nose_gap,
            "ejac_prox_body": as_f32(body_prox),
            "ejac_prox_gen": as_f32(genital_prox),
            "ejac_prox_nose_gen": as_f32(nose_prox),
            "ejac_activity_memory_3s": recent_peak,
            "ejac_is_still": stationary,
            "ejac_dist_body_diff": gap_change,
        }

        # -------------------------------------------------
        # 5. Combined score: high when previously active, now still,
        #    and still close to the target's genital region
        # -------------------------------------------------
        combined_prox = (
            feats["ejac_prox_body"] *
            feats["ejac_prox_gen"] *
            feats["ejac_prox_nose_gen"]
        )
        feats["ejac_static_score"] = (
            stationary * combined_prox * recent_peak
        ).astype("float32")

        # -------------------------------------------------
        # 6. Replace NaN / inf by 0
        # -------------------------------------------------
        return {
            name: series.replace([np.inf, -np.inf], np.nan)
                        .fillna(0.0)
                        .astype("float32")
            for name, series in feats.items()
        }


    def _feat_follow_pattern(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Pairwise features describing FOLLOW behavior.

        The pattern being described: the agent stays near the target, both
        point and move in the same direction, speeds are moderate, and the
        distance stays fairly stable over the rolling window.

        Args:
            ctx: Context of the agent.
            target_ctx: Context of the target. When ``None`` no feature is
                produced.
            **kwargs: Ignored.

        Returns:
            Mapping ``feature name -> float32 Series`` indexed by ``ctx.idx``;
            empty when ``target_ctx`` is ``None``. For each reference window
            ``w`` in (15, 30, 60) it holds the rolling mean/std of the
            agent-target distance, the rolling mean of the body and velocity
            direction cosines, and the rolling mean speed of both animals.
            NaN and +/-inf are replaced by 0.
        """
        if target_ctx is None:
            return {}

        frame_idx = ctx.idx

        def as_f32(values) -> pd.Series:
            # Wrap a scalar/array as a float32 Series on the agent's frame index.
            return pd.Series(values, index=frame_idx, dtype="float32")

        # --- 1. Base quantities ---
        # Length of the agent -> target vector.
        gap = as_f32(np.linalg.norm(target_ctx.pos - ctx.pos, axis=1))

        agent_vel = ctx.vel
        target_vel = target_ctx.vel
        agent_speed_np = np.linalg.norm(agent_vel, axis=1)
        target_speed_np = np.linalg.norm(target_vel, axis=1)

        agent_speed = ctx.speed_series.astype("float32")
        target_speed = as_f32(target_speed_np)

        # Body axis of each animal: tail_base -> nose.
        part_names = ("nose", "ear_left", "ear_right", "body_center", "tail_base", "hip_left", "hip_right", "neck")
        agent_parts = self._extract_parts_dict(ctx, list(part_names))
        target_parts = self._extract_parts_dict(target_ctx, list(part_names))

        def tail_to_nose(parts):
            nose = parts.get("nose")
            tail = parts.get("tail_base")
            return None if nose is None or tail is None else nose - tail

        agent_axis = tail_to_nose(agent_parts)
        target_axis = tail_to_nose(target_parts)

        if agent_axis is None or target_axis is None:
            # Without both body axes the alignment feature is all zeros.
            body_cos = as_f32(0.0)
        else:
            axis_dot = np.sum(agent_axis * target_axis, axis=1)
            axis_norm = np.linalg.norm(agent_axis, axis=1) * np.linalg.norm(target_axis, axis=1)
            body_cos = as_f32(np.clip(axis_dot / (axis_norm + 1e-6), -1.0, 1.0))

        # Cosine between the two velocity directions; left at 0 unless both
        # animals move faster than 1e-3.
        both_moving = (agent_speed_np > 1e-3) & (target_speed_np > 1e-3)
        vel_dot = np.sum(agent_vel * target_vel, axis=1)
        vel_norm = agent_speed_np * target_speed_np + 1e-6
        vel_cos_np = np.zeros_like(vel_dot, dtype="float32")
        vel_cos_np[both_moving] = np.clip(vel_dot[both_moving] / vel_norm[both_moving], -1.0, 1.0)
        vel_cos = as_f32(vel_cos_np)

        # --- 2. Rolling statistics over three window lengths ---
        feats: Dict[str, pd.Series] = {}
        for w30 in [15, 30, 60]:
            ws = self._scale(w30)
            min_p = max(ws // 3, 1)

            def windowed(series: pd.Series):
                return series.rolling(ws, min_periods=min_p)

            feats[f"follow_dist_mean_{w30}"] = windowed(gap).mean()
            feats[f"follow_dist_std_{w30}"] = windowed(gap).std()
            feats[f"follow_cos_body_mean_{w30}"] = windowed(body_cos).mean()
            feats[f"follow_cos_vel_mean_{w30}"] = windowed(vel_cos).mean()
            feats[f"follow_speed_agent_mean_{w30}"] = windowed(agent_speed).mean()
            feats[f"follow_speed_target_mean_{w30}"] = windowed(target_speed).mean()

        # --- 3. Replace NaN / inf by 0 ---
        return {
            name: series.replace([np.inf, -np.inf], np.nan)
                        .fillna(0.0)
                        .astype("float32")
            for name, series in feats.items()
        }
    
    def _feat_shortburst_social(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Short-window pairwise features for brief social bursts
        (attack-, chase- and escape-like motion).

        For each reference window ``w`` in (10, 20, 30), scaled through
        ``self._scale``, the following are produced:

          - ``sb_att_*``: rolling mean of the agent's speed along the
            agent->target axis, rolling mean of the relative closing speed,
            and the distance change versus ``ws`` frames earlier.
          - ``sb_chase_*``: rolling mean speed of agent and target and rolling
            mean distance.
          - ``sb_esc_*``: rolling mean cosine between the agent's body axis
            and the agent->target vector, and the distance gain ``ws`` frames
            later.

        NOTE: ``sb_esc_dist_gain_*`` uses ``shift(-ws)``, i.e. it reads frames
        from the future of the current frame.

        Args:
            ctx: Context of the agent.
            target_ctx: Context of the target. When ``None`` no feature is
                produced.
            **kwargs: Ignored.

        Returns:
            Mapping ``feature name -> float32 Series`` indexed by ``ctx.idx``;
            empty when ``target_ctx`` is ``None``. NaN and +/-inf are replaced
            by 0.
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None:
            return feats

        idx = ctx.idx

        def zero() -> pd.Series:
            return pd.Series(0.0, index=idx, dtype="float32")

        # --- Base quantities ---
        # Agent -> target vector and its length.
        rel_vec = target_ctx.pos - ctx.pos
        rel_dist = np.linalg.norm(rel_vec, axis=1)
        rel_dist_s = pd.Series(rel_dist, index=idx, dtype="float32")

        # Unit vector; zero distances are replaced by 1e-6 to avoid 0/0.
        rel_dist_safe = np.where(rel_dist == 0, 1e-6, rel_dist)
        u_vec = rel_vec / rel_dist_safe[:, None]

        # Velocity components along the connecting axis.
        a_vel = ctx.vel
        t_vel = target_ctx.vel
        a_along = np.sum(a_vel * u_vec, axis=1)                # > 0: agent moves toward target
        rel_along = np.sum((a_vel - t_vel) * u_vec, axis=1)    # > 0: the pair is closing in

        a_along_s = pd.Series(a_along, index=idx, dtype="float32")
        rel_along_s = pd.Series(rel_along, index=idx, dtype="float32")

        # Speeds of agent and target.
        a_speed = ctx.speed_series
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32"
        )

        # Cosine between the agent's body axis (tail_base -> nose) and the
        # agent -> target vector: ~ +1 facing the target, ~ -1 facing away.
        parts_a = self._extract_parts_dict(ctx, ["nose", "tail_base"])
        head_a = parts_a.get("nose")
        tail_a = parts_a.get("tail_base")

        if head_a is not None and tail_a is not None:
            body_vec_a = head_a - tail_a
            dot = np.sum(body_vec_a * rel_vec, axis=1)
            mag = np.linalg.norm(body_vec_a, axis=1) * rel_dist_safe
            heading_cos = np.clip(dot / (mag + 1e-6), -1.0, 1.0)
            heading_cos_s = pd.Series(heading_cos, index=idx, dtype="float32")
        else:
            # Body axis unavailable: the heading feature is all zeros.
            heading_cos_s = zero()

        # --- Rolling windows of 10, 20, 30 reference frames ---
        for w30 in [10, 20, 30]:
            ws = self._scale(w30)
            min_p = max(1, ws // 3)

            # Attack-like: strong approach, distance shrinking quickly.
            feats[f"sb_att_approach_mean_{w30}"] = a_along_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_att_rel_along_mean_{w30}"] = rel_along_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_att_dist_delta_{w30}"] = (rel_dist_s - rel_dist_s.shift(ws)).fillna(0.0)

            # Chase-like: both animals fast, distance relatively small.
            feats[f"sb_chase_speed_agent_mean_{w30}"] = a_speed.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_chase_speed_target_mean_{w30}"] = t_speed.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_chase_dist_mean_{w30}"] = rel_dist_s.rolling(ws, min_periods=min_p).mean()

            # Escape-like: heading away, distance growing quickly (look-ahead).
            feats[f"sb_esc_heading_cos_mean_{w30}"] = heading_cos_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_esc_dist_gain_{w30}"] = (rel_dist_s.shift(-ws) - rel_dist_s).fillna(0.0)

        # Replace NaN / inf by 0 and force float32.
        for k, v in feats.items():
            feats[k] = v.replace([np.inf, -np.inf], np.nan).fillna(0.0).astype("float32")

        return feats

    
    def build_pose_tensor(self, tracking: pd.DataFrame):
        """
        Convert long-format tracking rows into a position tensor plus
        per-mouse wide tables.

        Args:
            tracking: DataFrame with columns ``video_frame``, ``mouse_id``,
                ``bodypart``, ``x`` and ``y``.

        Returns:
            Tuple ``(frames, mouse_ids, pos, per_mouse_df)``:
              - ``frames``: sorted unique ``video_frame`` values.
              - ``mouse_ids``: list of mouse ids in sorted column order.
              - ``pos``: float32 array ``[len(frames), len(mouse_ids), 2]``
                holding the ``body_center`` (x, y) of each mouse, or the mean
                over all of its body parts when ``body_center`` is not a
                column for that mouse.
              - ``per_mouse_df``: dict ``mouse_id -> DataFrame`` indexed by
                frame with ``(bodypart, coordinate)`` columns.
        """
        ordered = tracking.sort_values("video_frame")
        frames = np.sort(ordered["video_frame"].unique())

        # Wide table: one row per frame, columns (mouse_id, bodypart, coord).
        wide = ordered.pivot(
            index="video_frame",
            columns=["mouse_id", "bodypart"],
            values=["x", "y"]
        )
        wide = wide.reorder_levels([1, 2, 0], axis=1)
        wide = wide.sort_index(axis=1).astype("float32")

        mouse_ids = list(wide.columns.get_level_values(0).unique())
        pos = np.full((len(frames), len(mouse_ids), 2), np.nan, dtype=np.float32)
        per_mouse_df = {}

        for slot, mouse in enumerate(mouse_ids):
            mouse_df = wide[mouse]
            per_mouse_df[mouse] = mouse_df

            has_center = "body_center" in mouse_df.columns.get_level_values(0)
            for axis_no, axis_name in enumerate(("x", "y")):
                if has_center:
                    coord = mouse_df["body_center"][axis_name]
                else:
                    # Fallback centroid: mean of every tracked part on this axis.
                    coord = mouse_df.xs(axis_name, level=1, axis=1).mean(axis=1)
                pos[:, slot, axis_no] = coord.reindex(frames).values

        return frames, mouse_ids, pos, per_mouse_df

    def extract_agent_target(
        self, 
        frames: np.ndarray, 
        mouse_ids: List[Any], 
        pos: np.ndarray, 
        agent_id: Any, 
        target_id: Any, 
        per_mouse_df: Dict = None
    ) -> pd.DataFrame:
        """
        Compute every registered feature for one (agent, target) pair.

        Args:
            frames: Frame numbers, passed on to ``self._build_context``.
            mouse_ids: Mouse ids in the order of ``pos``' second axis.
            pos: Position array indexed as ``pos[:, mouse, :]``.
            agent_id: Mouse whose features are computed.
            target_id: Partner mouse. A target context is only built when
                ``self.cfg.use_pairwise`` is set and ``target_id`` is a known
                mouse; otherwise feature functions receive ``target_ctx=None``.
            per_mouse_df: Optional ``mouse_id -> DataFrame`` of raw body-part
                data handed to ``self._build_context``.

        Returns:
            DataFrame indexed by the agent context's ``idx`` with columns in
            sorted order and NaN/inf replaced by 0. An empty DataFrame is
            returned when ``agent_id`` is not in ``mouse_ids``.
        """
        if agent_id not in mouse_ids:
            return pd.DataFrame()

        def context_of(mouse_id: Any):
            # Build the preprocessed context of one mouse.
            column = mouse_ids.index(mouse_id)
            raw = per_mouse_df.get(mouse_id) if per_mouse_df else None
            return self._build_context(frames, pos[:, column, :], raw)

        # 1. Agent context
        agent_ctx = context_of(agent_id)

        # 2. Target context (pairwise mode with a known target only)
        wants_target = (
            self.cfg.use_pairwise
            and target_id is not None
            and target_id in mouse_ids
        )
        partner_ctx = context_of(target_id) if wants_target else None

        # 3. Run every registered feature function and merge the outputs
        columns = {}
        for feature_fn in self.feature_registry.values():
            columns.update(feature_fn(agent_ctx, target_ctx=partner_ctx))

        table = pd.DataFrame(columns, index=agent_ctx.idx)
        table = table.replace([np.inf, -np.inf], np.nan).fillna(0.0)

        return table.reindex(sorted(table.columns), axis=1)


# =============================================================================================



from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import gc
import itertools
import json
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedGroupKFold

# (On Kaggle) use the official competition metric
import sys
sys.path.append("/kaggle/usr/lib/mabe-f-beta")
from metric import score   # hàm score(submission_df, dataset_df)

# =========================================================
# 1. PATHS & CONFIGURATION
# =========================================================

# Competition data root and its tracking / annotation sub-directories.
INPUT_DIR = Path("/kaggle/input/MABe-mouse-behavior-detection")
TRAIN_TRACKING_DIR = INPUT_DIR / "train_tracking"
TRAIN_ANNOTATION_DIR = INPUT_DIR / "train_annotation"
TEST_TRACKING_DIR = INPUT_DIR / "test_tracking"


WORKING_DIR = Path("/kaggle/working")
# Root directory of per-(lab, behavior) artifacts (fold models, thresholds,
# OOF predictions). It is created at import time if it does not exist.
# NOTE(review): this path lives under /kaggle/input, which is presumably
# read-only on Kaggle; confirm it is only meant for loading trained results.
RESULTS_DIR = Path(r"/kaggle/input/results-xgb-fe")
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

# Columns that identify one frame-level sample.
INDEX_COLS = ["video_id", "agent_id", "target_id", "video_frame"]

# "self" vs "pair" behaviors, same split as the notebook (adjust if needed).
# Behaviors listed in SELF_BEHAVIORS are trained in "self" mode, all others
# in "pair" mode.
SELF_BEHAVIORS = [
    "biteobject", "climb", "dig", "exploreobject", "freeze",
    "genitalgroom", "huddle", "rear", "rest", "run", "selfgroom",
]
PAIR_BEHAVIORS = [
    "allogroom", "approach", "attack", "attemptmount", "avoid",
    "chase", "chaseattack", "defend", "disengage", "dominance",
    "dominancegroom", "dominancemount", "ejaculate", "escape",
    "flinch", "follow", "intromit", "mount", "reciprocalsniff",
    "shepherd", "sniff", "sniffbody", "sniffface", "sniffgenital",
    "submit", "tussle",
]


# =========================================================
# 2. METADATA LOADING & HELPERS
# =========================================================

def load_metadata() -> pd.DataFrame:
    """Read ``train.csv`` from ``INPUT_DIR`` and return it as a DataFrame."""
    return pd.read_csv(INPUT_DIR / "train.csv")


def get_video_params(video_id: Any, meta: pd.DataFrame) -> Tuple[float, float]:
    """
    Look up the frame rate and pixel scale of one video.

    Args:
        video_id: Value matched against ``meta["video_id"]``.
        meta: Metadata table with columns ``video_id``,
            ``frames_per_second`` and ``pix_per_cm_approx``.

    Returns:
        ``(fps, pix_per_cm)`` taken from the first matching row.
        ``pix_per_cm`` is replaced by 1.0 when it is NaN, infinite or <= 0.

    Raises:
        KeyError: If no row of ``meta`` matches ``video_id``.
    """
    matches = meta.loc[meta["video_id"] == video_id]
    if matches.empty:
        raise KeyError(f"video_id={video_id} không có trong train.csv")

    record = matches.iloc[0]
    fps = float(record["frames_per_second"])
    scale = float(record["pix_per_cm_approx"])

    # Only a finite, strictly positive scale is usable; otherwise use 1.0.
    usable = np.isfinite(scale) and scale > 0
    return fps, (scale if usable else 1.0)


def load_tracking(lab_id: str, video_id: Any) -> pd.DataFrame:
    """
    Read the training tracking parquet of one video into pandas.

    The file is ``TRAIN_TRACKING_DIR/<lab_id>/<video_id>.parquet``
    (schema: video_frame, mouse_id, bodypart, x, y).

    Raises:
        FileNotFoundError: If the parquet file does not exist.
    """
    path = TRAIN_TRACKING_DIR / str(lab_id) / f"{video_id}.parquet"
    if path.exists():
        return pd.read_parquet(path)
    raise FileNotFoundError(path)

def load_tracking_test(lab_id: str, video_id: Any) -> pd.DataFrame:
    """
    Read the test tracking parquet of one video into pandas.

    The file is ``INPUT_DIR/test_tracking/<lab_id>/<video_id>.parquet``.

    Raises:
        FileNotFoundError: If the parquet file does not exist.
    """
    path = INPUT_DIR / "test_tracking" / str(lab_id) / f"{video_id}.parquet"
    if path.exists():
        return pd.read_parquet(path)
    raise FileNotFoundError(path)


def load_annotation(lab_id: str, video_id: Any) -> pd.DataFrame:
    """
    Read the annotation parquet of one training video.

    Returns:
        DataFrame restricted to the columns ``agent_id``, ``target_id``,
        ``action``, ``start_frame`` and ``stop_frame``. When the video has no
        annotation file, an empty DataFrame with those columns is returned.
    """
    wanted = ["agent_id", "target_id", "action", "start_frame", "stop_frame"]
    path = TRAIN_ANNOTATION_DIR / str(lab_id) / f"{video_id}.parquet"
    if path.exists():
        return pd.read_parquet(path)[wanted]
    # No labels for this video.
    return pd.DataFrame(columns=wanted)


# =========================================================
# 3. PER-FRAME FEATURES VIA FeatureExtractor
# =========================================================

# Cache: (lab, video, agent, target) -> (frames, feature_df).
# Agent/target are typed as Any because the test-time lookup stores its ids
# in the key without converting them to int.
_feature_cache: Dict[Tuple[str, int, Any, Any], Tuple[np.ndarray, pd.DataFrame]] = {}


def get_frame_features_for_pair(
    lab_id: str,
    video_id: int,
    agent_id: int,
    target_id: int,
    meta: pd.DataFrame,
) -> Tuple[np.ndarray, pd.DataFrame]:
    """
    Compute (or fetch from cache) the per-frame features of one training
    video and one (agent, target) pair.

    Agent and target may be the same mouse ("self") or two different mice
    ("pair").

    Args:
        lab_id: Lab the video belongs to (sub-directory of the tracking data).
        video_id: Video identifier; must be present in ``meta``.
        agent_id: Agent mouse id (converted with ``int`` for the cache key).
        target_id: Target mouse id (converted with ``int`` for the cache key).
        meta: Metadata table used to look up fps and pixels-per-cm.

    Returns:
        ``(frames, features_df)`` where ``features_df`` has one row per entry
        of ``frames`` and is indexed by frame number.

    Raises:
        KeyError: If ``video_id`` is missing from ``meta``.
        FileNotFoundError: If the tracking file does not exist.
        ValueError: If the extracted feature table does not have one row per
            frame, which happens when ``agent_id`` is not present in the
            tracking data (the extractor then returns an empty table).
    """
    key = (str(lab_id), int(video_id), int(agent_id), int(target_id))
    cached = _feature_cache.get(key)
    if cached is not None:
        return cached

    fps, pix_per_cm = get_video_params(video_id, meta)
    tracking = load_tracking(lab_id, video_id)

    fe = FeatureExtractor(
        fps=fps,
        pix_per_cm=pix_per_cm,
        smooth_sigma=1.0,
        use_pairwise=True,
    )

    frames, mouse_ids, pos, per_mouse_df = fe.build_pose_tensor(tracking)

    features_df: pd.DataFrame = fe.extract_agent_target(
        frames=frames,
        mouse_ids=mouse_ids,
        pos=pos,
        agent_id=agent_id,
        target_id=target_id,
        per_mouse_df=per_mouse_df,
    )

    # Assigning the index below would fail with an opaque pandas
    # "Length mismatch" ValueError; raise the same exception type with a
    # message that names the offending video / agent instead.
    if len(features_df) != len(frames):
        raise ValueError(
            f"No per-frame features for lab_id={lab_id}, video_id={video_id}, "
            f"agent_id={agent_id}: got {len(features_df)} rows for "
            f"{len(frames)} frames (agent missing from tracking?)"
        )

    # The index is the frame number.
    features_df.index = frames

    _feature_cache[key] = (frames, features_df)
    return frames, features_df

def get_frame_features_for_pair_test(
    lab_id: str,
    video_id: int,
    agent_id: Any,
    target_id: Any,
    test_meta: pd.DataFrame,
) -> Tuple[np.ndarray, pd.DataFrame]:
    """
    Compute (or fetch from cache) the per-frame features of one test video
    and one (agent, target) pair.

    Results are stored in the module-level ``_feature_cache`` shared with the
    training lookup; test keys are prefixed with ``"test_"`` so they cannot
    collide with training keys of the same lab. The cache is deliberately not
    re-created here, as that would replace the dict defined earlier in the
    module.

    Args:
        lab_id: Lab the video belongs to (sub-directory of the tracking data).
        video_id: Video identifier; must be present in ``test_meta``.
        agent_id: Agent mouse id, used as-is.
        target_id: Target mouse id, used as-is.
        test_meta: Test metadata with columns ``video_id``,
            ``frames_per_second`` and ``pix_per_cm_approx``.

    Returns:
        ``(frames, features_df)`` where ``features_df`` has one row per entry
        of ``frames`` and is indexed by frame number.

    Raises:
        IndexError: If ``video_id`` is missing from ``test_meta``.
        FileNotFoundError: If the tracking file does not exist.
        ValueError: If the extracted feature table does not have one row per
            frame, which happens when ``agent_id`` is not present in the
            tracking data (the extractor then returns an empty table).
    """
    key = (f"test_{lab_id}", int(video_id), agent_id, target_id)
    cached = _feature_cache.get(key)
    if cached is not None:
        return cached

    # fps and pix_per_cm_approx come from the test metadata.
    rows = test_meta[test_meta["video_id"] == video_id]
    if rows.empty:
        # Same exception type as the bare ``.iloc[0]`` would raise, but with
        # a message that identifies the missing video.
        raise IndexError(f"video_id={video_id} not found in test metadata")
    row = rows.iloc[0]
    fps = float(row["frames_per_second"])
    pix_per_cm = float(row["pix_per_cm_approx"])
    if not np.isfinite(pix_per_cm) or pix_per_cm <= 0:
        pix_per_cm = 1.0

    tracking = load_tracking_test(lab_id, video_id)

    fe = FeatureExtractor(
        fps=fps,
        pix_per_cm=pix_per_cm,
        smooth_sigma=1.0,
        use_pairwise=True,
    )

    frames, mouse_ids, pos, per_mouse_df = fe.build_pose_tensor(tracking)

    features_df = fe.extract_agent_target(
        frames=frames,
        mouse_ids=mouse_ids,
        pos=pos,
        agent_id=agent_id,
        target_id=target_id,
        per_mouse_df=per_mouse_df,
    )

    # Assigning the index below would fail with an opaque pandas
    # "Length mismatch" ValueError; raise the same exception type with a
    # message that names the offending video / agent instead.
    if len(features_df) != len(frames):
        raise ValueError(
            f"No per-frame features for test lab_id={lab_id}, "
            f"video_id={video_id}, agent_id={agent_id}: got "
            f"{len(features_df)} rows for {len(frames)} frames "
            f"(agent missing from tracking?)"
        )
    features_df.index = frames

    _feature_cache[key] = (frames, features_df)
    return frames, features_df



# =========================================================
# 4. BUILD THE FRAME-LEVEL DATASET FOR ONE (lab_id, behavior)
# =========================================================

def build_frame_dataset_for_lab_behavior(
    lab_id: str,
    behavior: str,
    train_meta: pd.DataFrame,
    mode: str = "self",
) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray]:
    """
    Assemble the frame-level training set of one (lab, behavior).

    Only videos annotated with ``behavior`` contribute, and within them only
    the (agent, target) pairs that carry at least one positive frame.

    Args:
        lab_id: Lab whose videos are used.
        behavior: Action name to build labels for.
        train_meta: Metadata with at least ``lab_id`` and ``video_id``.
        mode: ``"self"`` computes features with the agent as its own target;
            any other value uses the annotated target.

    Returns:
        ``(indices, features, labels)``:
          - ``indices``: DataFrame with the ``INDEX_COLS`` columns.
          - ``features``: per-frame feature DataFrame, row-aligned with
            ``indices``.
          - ``labels``: int8 array, 1 where the frame lies inside any
            ``[start_frame, stop_frame)`` interval of the pair.
        All three are empty when nothing qualifies.
    """
    lab_rows = train_meta[train_meta["lab_id"] == lab_id]
    video_ids = lab_rows["video_id"].unique().tolist()
    self_mode = mode == "self"

    index_parts = []
    feature_parts = []
    label_parts = []

    for video_id in video_ids:
        ann = load_annotation(lab_id, video_id)
        if ann.empty:
            continue

        # Keep only the annotations of the requested behavior.
        behavior_ann = ann[ann["action"] == behavior]
        if behavior_ann.empty:
            continue

        unique_pairs = (
            behavior_ann[["agent_id", "target_id"]].drop_duplicates().values.tolist()
        )
        for agent_id, target_id in unique_pairs:
            frames, feat_df = get_frame_features_for_pair(
                lab_id=lab_id,
                video_id=video_id,
                agent_id=agent_id,
                target_id=agent_id if self_mode else target_id,
                meta=train_meta,
            )

            # Intervals of this exact pair; in self mode fall back to every
            # interval of the agent when the exact match finds nothing.
            same_agent = behavior_ann["agent_id"] == agent_id
            segments = behavior_ann[same_agent & (behavior_ann["target_id"] == target_id)]
            if segments.empty and self_mode:
                segments = behavior_ann[same_agent]

            positive_frames = set()
            for start, stop in zip(segments["start_frame"], segments["stop_frame"]):
                positive_frames.update(range(int(start), int(stop)))
            if not positive_frames:
                continue

            label = np.isin(frames, list(positive_frames)).astype("int8")
            if not label.any():
                continue

            index_parts.append(
                pd.DataFrame(
                    {
                        "video_id": video_id,
                        "agent_id": agent_id,
                        "target_id": target_id,
                        "video_frame": frames,
                    }
                )
            )
            feature_parts.append(feat_df.reset_index(drop=True))
            label_parts.append(label)

    if not index_parts:
        empty_index = pd.DataFrame(columns=INDEX_COLS)
        return empty_index, pd.DataFrame(), np.zeros(0, dtype="int8")

    indices = pd.concat(index_parts, ignore_index=True)
    features = pd.concat(feature_parts, ignore_index=True)
    labels = np.concatenate(label_parts).astype("int8")

    assert len(indices) == len(features) == len(labels)

    return indices, features, labels


# =========================================================
# 5. TRAIN + OOF FOR ONE (lab_id, behavior)
# =========================================================

def tune_threshold(oof_pred: np.ndarray, y: np.ndarray) -> float:
    """
    Pick the probability threshold that maximizes binary F1.

    Thresholds 0.000, 0.005, ..., 1.000 are scanned; a sample is predicted
    positive when ``oof_pred >= threshold``. On ties the smallest threshold
    wins.

    F1 is computed directly from counts as ``2*TP / (#predicted + #actual)``
    and taken as 0 when that denominator is 0. The positive mask and count
    are computed once rather than re-deriving them from the raw arrays for
    each of the 201 thresholds.

    Args:
        oof_pred: Predicted probabilities, one per sample.
        y: Binary ground truth (1 = positive), same length as ``oof_pred``.

    Returns:
        The selected threshold as a Python float (0.0 for empty input or
        when no threshold yields a positive F1).
    """
    oof_pred = np.asarray(oof_pred)
    positive = np.asarray(y) == 1
    n_positive = int(np.count_nonzero(positive))

    best_th = 0.0
    best_f1 = -1.0
    for th in np.arange(0.0, 1.005, 0.005):
        flagged = oof_pred >= th
        true_pos = int(np.count_nonzero(flagged & positive))
        denom = int(np.count_nonzero(flagged)) + n_positive
        f1 = (2.0 * true_pos / denom) if denom else 0.0
        # Strict '>' keeps the first (smallest) threshold among equal scores.
        if f1 > best_f1:
            best_th, best_f1 = float(th), f1
    return best_th

#
def train_validate_one(
    lab_id: str,
    behavior: str,
    indices: pd.DataFrame,
    features: pd.DataFrame,
    labels: np.ndarray,
) -> float:
    """
    Train binary XGBoost models for one (lab, behavior) with 3-fold
    video-grouped cross-validation and store the out-of-fold predictions.

    Written under ``RESULTS_DIR/<lab_id>/<behavior>/``:
      - ``fold_<k>/model.json`` and ``fold_<k>/threshold.txt`` per fold,
      - ``oof_predictions.parquet`` (``indices`` plus ``fold``,
        ``prediction``, ``predicted_label``),
      - ``f1.txt`` with the frame-level OOF F1.

    The decision threshold of each fold is tuned on that fold's own
    validation split, so the reported OOF F1 is optimistic.

    Args:
        lab_id: Lab identifier (directory name).
        behavior: Behavior name (directory name).
        indices: Frame keys; ``video_id`` is used as the CV group.
        features: Per-frame features, row-aligned with ``indices``.
        labels: Binary labels, row-aligned with ``indices``.

    Returns:
        Frame-level F1 over all OOF hard labels; 0.0 when there are no
        samples or no positives (in that case only an all-zero OOF file and
        ``f1.txt`` are written).
    """
    result_dir = RESULTS_DIR / lab_id / behavior
    result_dir.mkdir(parents=True, exist_ok=True)

    n = len(labels)

    if n == 0 or labels.sum() == 0:
        # Nothing to learn from: record a trivial all-negative result.
        oof_df = indices.copy()
        oof_df["fold"] = -1
        oof_df["prediction"] = 0.0
        oof_df["predicted_label"] = 0
        oof_df.to_parquet(result_dir / "oof_predictions.parquet", index=False)
        (result_dir / "f1.txt").write_text("0.0\n")
        return 0.0

    X = features.values.astype("float32")
    y = labels.astype("int8")
    groups = indices["video_id"].values
    feature_names = features.columns.tolist()

    # fold == -1 marks rows that never landed in a validation split.
    folds = np.full(n, -1, dtype="int8")
    oof_pred = np.zeros(n, dtype="float32")
    oof_label = np.zeros(n, dtype="int8")

    cv = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=42)

    for fold, (tr_idx, va_idx) in enumerate(cv.split(X, y, groups=groups)):
        # With fewer videos than splits a fold can come back empty; there is
        # nothing to train on or validate against, so skip it instead of
        # handing an empty matrix to XGBoost.
        if len(tr_idx) == 0 or len(va_idx) == 0:
            continue

        fold_dir = result_dir / f"fold_{fold}"
        fold_dir.mkdir(parents=True, exist_ok=True)

        X_tr, y_tr = X[tr_idx], y[tr_idx]
        X_va, y_va = X[va_idx], y[va_idx]

        # Re-weight positives by the negative/positive ratio of this fold.
        pos = y_tr.sum()
        neg = len(y_tr) - pos
        scale_pos_weight = float(neg / pos) if pos > 0 else 1.0

        params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "device": "cuda",
            "tree_method": "hist",
            "learning_rate": 0.05,
            "max_depth": 6,
            "min_child_weight": 5,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "scale_pos_weight": scale_pos_weight,
            "max_bin": 64,
            "seed": 42,
        }

        dtrain = xgb.QuantileDMatrix(
            X_tr,
            label=y_tr,
            feature_names=feature_names,
            max_bin=64,
        )
        dvalid = xgb.DMatrix(
            X_va,
            label=y_va,
            feature_names=feature_names,
        )

        # Stop when validation logloss has not improved for 10 rounds.
        early_stop = xgb.callback.EarlyStopping(
            rounds=10, metric_name="logloss", data_name="valid", maximize=False
        )

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=250,
            evals=[(dtrain, "train"), (dvalid, "valid")],
            callbacks=[early_stop],
            verbose_eval=False,
        )

        pred_va = model.predict(dvalid)
        th = tune_threshold(pred_va, y_va)

        folds[va_idx] = fold
        oof_pred[va_idx] = pred_va
        oof_label[va_idx] = (pred_va >= th).astype("int8")

        model.save_model(fold_dir / "model.json")
        with open(fold_dir / "threshold.txt", "w") as f:
            f.write(f"{th}\n")

    # Persist the OOF predictions.
    oof_df = indices.copy()
    oof_df["fold"] = folds
    oof_df["prediction"] = oof_pred
    oof_df["predicted_label"] = oof_label
    oof_df.to_parquet(result_dir / "oof_predictions.parquet", index=False)

    f1 = f1_score(y, oof_label, zero_division=0)
    (result_dir / "f1.txt").write_text(f"{f1:.6f}\n")
    return float(f1)

def load_models_for_behavior_infer(lab_id: str, behavior: str):
    """
    Load the per-fold boosters and thresholds of one (lab, behavior) for
    inference.

    Fold directories ``RESULTS_DIR/<lab_id>/<behavior>/fold_*`` are visited in
    sorted order. A fold without ``model.json`` is skipped; a fold without
    ``threshold.txt`` gets the threshold 0.5.

    Returns:
        List of ``(booster, threshold)`` tuples; empty when the
        (lab, behavior) directory does not exist.
    """
    base_dir = RESULTS_DIR / lab_id / behavior
    if not base_dir.exists():
        return []

    loaded = []
    for fold_dir in sorted(base_dir.glob("fold_*")):
        model_path = fold_dir / "model.json"
        if not model_path.exists():
            continue

        booster = xgb.Booster()
        booster.load_model(str(model_path))

        threshold_path = fold_dir / "threshold.txt"
        if threshold_path.exists():
            threshold = float(threshold_path.read_text().strip())
        else:
            threshold = 0.5

        loaded.append((booster, threshold))

    return loaded


# =========================================================
# 6. LOOP OVER ALL BEHAVIORS OF ONE LAB
#    (train_all_labs_behaviors itself is unchanged, but main
#     filters train_meta down to a single lab)
# =========================================================

def train_all_labs_behaviors(train_meta: pd.DataFrame):
    """
    Train one frame-level model per (lab, action) for every lab in
    ``train_meta``.

    For each lab the annotations of all its videos are read, the set of
    actions that actually occur is collected, and for each action (in sorted
    order) the frame-level dataset is built and trained. Actions listed in
    ``SELF_BEHAVIORS`` use mode ``"self"``, all others ``"pair"``. Progress
    and the OOF F1 are printed; nothing is returned.
    """
    started = time.perf_counter()

    for lab_id in train_meta["lab_id"].unique().tolist():
        # Videos belonging to this lab.
        lab_videos = (
            train_meta[train_meta["lab_id"] == lab_id]["video_id"].unique().tolist()
        )

        # Every action that really appears in this lab's annotations.
        seen_actions = set()
        for vid in lab_videos:
            ann = load_annotation(lab_id, vid)
            if not ann.empty:
                seen_actions.update(ann["action"].unique().tolist())

        behaviors = sorted(seen_actions)
        print(f"\n===== LAB {lab_id}: {len(behaviors)} behaviors =====")

        lab_key = str(lab_id)
        for behavior in behaviors:
            mode = "pair" if behavior not in SELF_BEHAVIORS else "self"

            print(f"\n=== LAB={lab_id} | behavior={behavior} | mode={mode} ===")
            indices, features, labels = build_frame_dataset_for_lab_behavior(
                lab_id=lab_key,
                behavior=behavior,
                train_meta=train_meta,
                mode=mode,
            )
            n_features = features.shape[1] if not features.empty else 0
            print(
                f"frames: {len(labels):,}, positives: {labels.sum():,}, features: {n_features}"
            )

            if len(labels) == 0:
                print(" -> skip (no samples)")
                continue

            f1 = train_validate_one(lab_key, behavior, indices, features, labels)
            elapsed = time.perf_counter() - started
            print(f" -> OOF F1 (frame-level): {f1:.3f} | elapsed={elapsed/60:.1f} min")



# =========================================================
# 7. COLLECT OOF PREDICTIONS → SEGMENTS & COMPUTE SCORE()
# =========================================================

def build_oof_submission_from_parquet(
    target_lab_id: Optional[str] = None,
) -> pd.DataFrame:
    """
    Convert saved out-of-fold frame predictions into segment-level rows.

    Every ``oof_predictions.parquet`` found below ``RESULTS_DIR`` is read. The
    behavior and lab of a file are taken from the two directory names directly
    above it. Frame scores are thresholded at 0.5 and each run of consecutive
    positive rows (in ``video_frame`` order) of one
    (lab, video, agent, target, action) group becomes one half-open segment
    ``[start_frame, stop_frame)``.

    Args:
        target_lab_id: when given, only files whose lab directory equals this
            value are used (e.g. "AdaptableSnail").

    Returns:
        DataFrame with columns lab_id, video_id, agent_id, target_id, action,
        start_frame, stop_frame, sorted by those keys. Empty (columns only)
        when no frame reaches the threshold.

    Raises:
        RuntimeError: if no parquet file exists, or none matches the lab.
    """
    key_cols = ["lab_id", "video_id", "agent_id", "target_id", "action"]

    parquet_paths = list(RESULTS_DIR.glob("*/**/oof_predictions.parquet"))
    if len(parquet_paths) == 0:
        raise RuntimeError("Không tìm thấy OOF parquet, hãy train trước.")

    tables = []
    for parquet_path in parquet_paths:
        # expected layout: <results>/<lab>/<behavior>/oof_predictions.parquet
        file_behavior = parquet_path.parts[-2]
        file_lab = parquet_path.parts[-3]

        # keep only the requested lab, if one was requested
        if not (target_lab_id is None or file_lab == target_lab_id):
            continue

        table = pd.read_parquet(parquet_path)
        table = table[INDEX_COLS + ["prediction"]].copy()
        table["lab_id"] = file_lab
        table["action"] = file_behavior
        tables.append(table)

    if len(tables) == 0:
        raise RuntimeError(
            f"Không có OOF predictions nào cho lab_id={target_lab_id}"
        )

    frame_df = (
        pd.concat(tables, ignore_index=True)
        .sort_values(key_cols + ["video_frame"])
        .reset_index(drop=True)
    )

    def segment_row(key, first_frame, end_frame):
        # Built lazily so id conversion only happens for emitted segments.
        lab, video, agent, target, act = key
        return {
            "lab_id": lab,
            "video_id": int(video),
            "agent_id": int(agent),
            "target_id": int(target),
            "action": act,
            "start_frame": first_frame,
            "stop_frame": end_frame,
        }

    rows = []
    for key, frames_of_key in frame_df.groupby(key_cols, sort=False):
        frame_ids = frames_of_key["video_frame"].values
        # fixed demo threshold; a per-(lab, behavior) threshold could be used
        positive = frames_of_key["prediction"].values >= 0.5

        run_start = None   # start frame of the currently open run, if any
        last_frame = None  # frame id of the previously visited row
        for frame_id, is_pos in zip(frame_ids, positive):
            if is_pos:
                if run_start is None:
                    run_start = int(frame_id)
            elif run_start is not None:
                # run ended on the previous row -> [start, stop)
                rows.append(segment_row(key, run_start, int(last_frame + 1)))
                run_start = None
            last_frame = frame_id

        # close a run that reaches the last row of the group
        if run_start is not None:
            rows.append(segment_row(key, run_start, int(frame_ids[-1] + 1)))

    if len(rows) == 0:
        return pd.DataFrame(columns=key_cols + ["start_frame", "stop_frame"])

    return (
        pd.DataFrame(rows)
        .sort_values(key_cols + ["start_frame"])
        .reset_index(drop=True)
    )

# video_id values to exclude before validation scoring (empty list = keep every video).
BAD_VIDEOS = []

def compute_validation_score(
    submission: pd.DataFrame,
    lab_id: Optional[str] = None,
) -> float:
    """
    Evaluate a segment-level submission with the `score()` metric on train data.

    The video list comes from ``train.csv``; the ground truth is assembled by
    loading the annotation of every listed video and tagging it with its
    lab_id, video_id and behaviors_labeled values.

    Args:
        submission: predicted segments (must contain a ``lab_id`` column when
            ``lab_id`` is given).
        lab_id: when not None, both ground truth and submission are restricted
            to this lab.

    Returns:
        The metric value as a float, or 0.0 when no annotation could be loaded.
    """
    only_one_lab = lab_id is not None

    meta = pd.read_csv(INPUT_DIR / "train.csv")
    if only_one_lab:
        meta = meta[meta["lab_id"] == lab_id].reset_index(drop=True)
    if BAD_VIDEOS:
        meta = meta[~meta["video_id"].isin(BAD_VIDEOS)]

    # Load every annotation file and attach the metadata the metric needs.
    tagged = []
    for _, meta_row in meta.iterrows():
        row_lab = meta_row["lab_id"]
        row_video = meta_row["video_id"]
        ann = load_annotation(row_lab, row_video)
        if ann.empty:
            continue
        ann["lab_id"] = row_lab
        ann["video_id"] = row_video
        ann["behaviors_labeled"] = meta_row["behaviors_labeled"]
        tagged.append(ann)

    if len(tagged) == 0:
        print("Không có annotation nào để validate!")
        return 0.0

    dataset = pd.concat(tagged, ignore_index=True)

    # Restrict the predictions to the same lab as the ground truth.
    if only_one_lab:
        submission = submission[submission["lab_id"] == lab_id].reset_index(drop=True)

    s = score(dataset, submission, row_id_column_name="row_id")

    lab_suffix = ' (lab=' + lab_id + ')' if only_one_lab else ''
    print(f"Official validation score{lab_suffix}: {s:.6f}")
    return float(s)



# =========================================================
# 8. MAIN
# =========================================================
def str_to_mouse_id(s: str) -> int:
    """
    Convert a mouse label to its integer id.

    "self" maps to -1; any other value is stringified, every "mouse"
    substring is removed, and the remainder is parsed as an int.
    """
    if s != "self":
        digits = str(s).replace("mouse", "")
        return int(digits)
    return -1


def predict_behaviors_for_pair(
    lab_id: str,
    video_id: int,
    agent_internal_id: Any,
    target_internal_id: Any,
    behaviors: List[str],
    test_meta: pd.DataFrame,
) -> pd.DataFrame:
    """
    Run inference for one (video, agent, target) pair over a list of behaviors.

    All behaviors in the list are expected to share one mode (all "self" or
    all "pair"). For every behavior the stored models are loaded, each model's
    probabilities are zeroed where they fall below that model's threshold, and
    the results are averaged. Per frame, the behavior with the highest averaged
    score wins; frames whose best score is 0 are labelled "none". Consecutive
    rows with the same label are merged into half-open segments.

    Args:
        lab_id: lab name; this cell only handles "ElegantMink".
        video_id: id of the test video.
        agent_internal_id: mouse id of the agent as used by the tracking data.
        target_internal_id: mouse id of the target (same as agent for "self").
        behaviors: behavior names to score.
        test_meta: test metadata forwarded to the feature builder.

    Returns:
        DataFrame with columns video_id, action, start_frame, stop_frame.
        Empty (columns only) when the lab is not handled, no features could be
        built, no model was found, or nothing was predicted.
    """
    segment_columns = ["video_id", "action", "start_frame", "stop_frame"]

    # Return an empty frame (not None) so callers can always test `.empty`.
    if lab_id != "ElegantMink":
        return pd.DataFrame(columns=segment_columns)

    frames, feat_df = get_frame_features_for_pair_test(
        lab_id=lab_id,
        video_id=video_id,
        agent_id=agent_internal_id,
        target_id=target_internal_id,
        test_meta=test_meta,
    )
    if feat_df.empty:
        return pd.DataFrame(columns=segment_columns)

    feat_df = feat_df.astype("float32")
    n_frames = len(feat_df)

    scores_per_behavior = {}
    for behavior in behaviors:
        models = load_models_for_behavior_infer(lab_id, behavior)
        if not models:
            continue

        req_feats = models[0][0].feature_names
        # Build X_test with exactly the model's feature set; features the
        # extractor did not produce stay at 0.0.
        X_test = pd.DataFrame(
            0.0,
            index=feat_df.index,
            columns=req_feats,
            dtype=np.float32,
        )
        common = list(set(req_feats) & set(feat_df.columns))
        if common:
            X_test[common] = feat_df[common]

        dtest = xgb.DMatrix(X_test, feature_names=req_feats)

        agg_scores = np.zeros(n_frames, dtype=np.float32)
        for booster, thr in models:
            probs = booster.predict(dtest)
            # a model contributes its probability only where it passes its
            # own threshold
            labels = (probs >= thr).astype(np.int8)
            agg_scores += probs * labels

        agg_scores /= max(len(models), 1)
        scores_per_behavior[behavior] = agg_scores

        del dtest, X_test
        gc.collect()

    if not scores_per_behavior:
        return pd.DataFrame(columns=segment_columns)

    beh_list = list(scores_per_behavior.keys())
    score_mat = np.vstack([scores_per_behavior[b] for b in beh_list]).T  # [F, B]

    max_idx = score_mat.argmax(axis=1)
    max_scores = score_mat.max(axis=1)
    labels = np.where(max_scores == 0.0, "none", np.array(beh_list)[max_idx])

    # frame-level labels -> segments
    segments = []
    prev_lab = "none"
    prev_start = None
    prev_f = None

    for f, lab in zip(frames, labels):
        if lab != prev_lab:
            if prev_lab != "none":
                segments.append(
                    {
                        "video_id": int(video_id),
                        "action": prev_lab,
                        "start_frame": int(prev_start),
                        "stop_frame": int(prev_f + 1),
                    }
                )
            prev_lab = lab
            prev_start = f
        prev_f = f

    # close the segment that is still open at the last frame
    if prev_lab != "none":
        segments.append(
            {
                "video_id": int(video_id),
                "action": prev_lab,
                "start_frame": int(prev_start),
                "stop_frame": int(prev_f + 1),
            }
        )

    if not segments:
        return pd.DataFrame(columns=segment_columns)

    return pd.DataFrame(segments)



# Inference driver for one lab: predicts segments for every test video of
# `target_lab` and writes them to WORKING_DIR / "submission3.csv".
target_lab = "ElegantMink"

print(f"Đọc test.csv cho lab {target_lab} ...")
test_meta = pd.read_csv(INPUT_DIR / "test.csv")
test_meta = test_meta[test_meta["lab_id"] == target_lab].reset_index(drop=True)

# Trained behaviors = names of the sub-directories of RESULTS_DIR / target_lab
lab_result_dir = RESULTS_DIR / target_lab
if lab_result_dir.exists():
    trained_behaviors = sorted(
        [p.name for p in lab_result_dir.iterdir() if p.is_dir()]
    )
else:
    trained_behaviors = []

# Split the trained behaviors by mode; a behavior in neither set is ignored.
self_behaviors_in_lab = [b for b in trained_behaviors if b in SELF_BEHAVIORS]
pair_behaviors_in_lab = [b for b in trained_behaviors if b in PAIR_BEHAVIORS]

print("Behaviors (self) dùng để predict:", self_behaviors_in_lab)
print("Behaviors (pair) dùng để predict:", pair_behaviors_in_lab)

all_segments = []

# Loop over every test video of the lab
for video_id in sorted(test_meta["video_id"].unique()):
    print(f"Predict video_id={video_id} ...")

    tracking = load_tracking_test(target_lab, video_id)
    mouse_ids_internal = sorted(tracking["mouse_id"].unique().tolist())

    # Map an internal mouse_id to the string form written to the submission
    # ("mouse<id>", unless it already starts with "mouse")
    def to_submit_id(mid):
        s = str(mid)
        return s if s.startswith("mouse") else f"mouse{s}"

    # SELF behaviors: agent == target (self)
    if self_behaviors_in_lab:
        for mid in mouse_ids_internal:
            seg_df = predict_behaviors_for_pair(
                lab_id=target_lab,
                video_id=video_id,
                agent_internal_id=mid,
                target_internal_id=mid,  # self
                behaviors=self_behaviors_in_lab,
                test_meta=test_meta,
            )
            if not seg_df.empty:
                seg_df["agent_id"] = to_submit_id(mid)
                seg_df["target_id"] = "self"
                all_segments.append(seg_df)

    # PAIR behaviors: every ordered pair with agent != target
    if pair_behaviors_in_lab and len(mouse_ids_internal) > 1:
        for agent_internal, target_internal in itertools.permutations(
            mouse_ids_internal, 2
        ):
            seg_df = predict_behaviors_for_pair(
                lab_id=target_lab,
                video_id=video_id,
                agent_internal_id=agent_internal,
                target_internal_id=target_internal,
                behaviors=pair_behaviors_in_lab,
                test_meta=test_meta,
            )
            if not seg_df.empty:
                seg_df["agent_id"] = to_submit_id(agent_internal)
                seg_df["target_id"] = to_submit_id(target_internal)
                all_segments.append(seg_df)

# Merge all segments into the submission table
if all_segments:
    submission3 = pd.concat(all_segments, ignore_index=True)
    submission3 = submission3[
        ["video_id", "agent_id", "target_id", "action", "start_frame", "stop_frame"]
    ]
    submission3 = submission3.sort_values(
        ["video_id", "agent_id", "target_id", "action", "start_frame"]
    ).reset_index(drop=True)
else:
    # Empty DataFrame with the right columns; NO dummy row
    submission3 = pd.DataFrame(
        columns=[
            "video_id",
            "agent_id",
            "target_id",
            "action",
            "start_frame",
            "stop_frame",
        ]
    )

# Add row_id as the first column (also when the table is empty)
submission3.insert(0, "row_id", np.arange(len(submission3), dtype=np.int64))

sub_path = WORKING_DIR / "submission3.csv"
submission3.to_csv(sub_path, index=False)
print(f"Saved ElegantMink submission to {sub_path}")


Đọc test.csv cho lab ElegantMink ...
Behaviors (self) dùng để predict: []
Behaviors (pair) dùng để predict: ['allogroom', 'attack', 'attemptmount', 'ejaculate', 'intromit', 'mount', 'sniff']
Saved ElegantMink submission to /kaggle/working/submission3.csv


# GroovyShrew

In [6]:
from pathlib import Path
import shutil
import gc

# Root of the notebook's writable output directory.
WORKING_DIR = Path("/kaggle/working")

# 1) Remove everything directly inside /kaggle/working except .csv files,
#    so earlier submissions survive while other files and directories are removed.
for path in WORKING_DIR.iterdir():
    # keep .csv files
    if path.is_file() and path.suffix == ".csv":
        continue

    if path.is_file():
        try:
            path.unlink()
        except Exception as e:
            # best effort: report and continue with the next entry
            print(f"Cannot remove file {path}: {e}")
    elif path.is_dir():
        try:
            # NOTE(review): ignore_errors=True already suppresses removal
            # failures, so this except branch is presumably rarely reached.
            shutil.rmtree(path, ignore_errors=True)
        except Exception as e:
            print(f"Cannot remove dir {path}: {e}")


# Run a garbage-collection pass after the cleanup.
gc.collect()

76

In [7]:
from __future__ import annotations
from typing import Dict, List, Tuple, Any, Optional
import warnings
from dataclasses import dataclass, field
import numpy as np
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from tqdm import tqdm

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
np.seterr(invalid="ignore", divide="ignore")

# =============================================================================
# 1. CONFIGURATION
# =============================================================================
@dataclass
class FeatureConfig:
    """
    Holds the configuration parameters (hyperparameters) of feature extraction.
    """
    fps: float = 30.0          # video frame rate, frames per second
    pix_per_cm: float = 1.0    # pixels per centimetre (pixel values are divided by this)
    smooth_sigma: float = 1.0  # sigma of the Gaussian smoothing; None skips smoothing
    use_pairwise: bool = True  # pairwise-feature switch (its consumer is not visible here)


# =============================================================================
# 2. AGENT CONTEXT
# =============================================================================
@dataclass
class AgentContext:
    """
    Container for the preprocessed data of a single mouse.
    Avoids recomputing velocity/acceleration several times.
    """
    idx: pd.Index          # frame index
    pos: np.ndarray        # [F, 2] cm
    vel: np.ndarray        # [F, 2] cm/s
    speed: np.ndarray      # [F, 1] cm/s
    acc: np.ndarray        # [F, 2] cm/s^2

    cx: pd.Series          # X coordinate as a Series (for rolling windows)
    cy: pd.Series          # Y coordinate as a Series
    speed_series: pd.Series # speed as a Series

    raw_df: Optional[pd.DataFrame] = None # original per-body-part tracking data


# =============================================================================
# 3. FEATURE EXTRACTOR
# =============================================================================
class FeatureExtractor:
    """
    Class trích xuất đặc trưng hành vi từ dữ liệu tracking.
    """
    def __init__(self, fps: float, pix_per_cm: float, smooth_sigma: float = 1.0, use_pairwise: bool = True):
        """
        Store the extraction parameters and register the feature functions.

        Args:
            fps: video frame rate (cast to float).
            pix_per_cm: pixels per centimetre (cast to float).
            smooth_sigma: sigma for Gaussian smoothing of positions.
            use_pairwise: pairwise-feature switch, stored in the config.
        """
        # Map the constructor arguments into the config object
        self.cfg = FeatureConfig(
            fps=float(fps), 
            pix_per_cm=float(pix_per_cm), 
            smooth_sigma=smooth_sigma,
            use_pairwise=use_pairwise
        )
        
        # Register the feature functions to run (name -> bound method).
        # NOTE(review): "a" and "b" are opaque keys for the attack/sniff and
        # climb feature groups; how the registry is iterated is not visible here.
        self.feature_registry = {
            "kinematics": self._feat_basic_kinematics,
            "multiscale": self._feat_multiscale,
            "long_range": self._feat_long_range,
            "cumulative": self._feat_cumulative,
            "curvature": self._feat_curvature,
            "speed_asym": self._feat_speed_asym,
            "gauss_shift": self._feat_gauss_shift,
            "pose_shape": self._feat_pose_shape,
            "pairwise": self._feat_pairwise,
            "follow": self._feat_follow_pattern,
            "short": self._feat_shortburst_social,
            "a": self._feat_attack_sniff,
            "b": self._feat_climb
        }

    # --- Helpers ---
    def _scale(self, n_frames_30fps: int) -> int:
        """Quy đổi số frame từ chuẩn 30fps sang fps thực tế của video."""
        return max(1, int(round(n_frames_30fps * self.cfg.fps / 30.0)))

    def _to_cm(self, arr):
        """Chuyển pixel -> cm."""
        return arr / self.cfg.pix_per_cm

    def _smooth(self, x):
        """Làm mượt dữ liệu bằng Gaussian filter."""
        if self.cfg.smooth_sigma is None or x.shape[0] < 3: return x
        if np.all(np.isnan(x)): return x
        return gaussian_filter1d(x, sigma=self.cfg.smooth_sigma, axis=0, mode="nearest")

    def _forward_fill_nan(self, pos):
        """
        Điền dữ liệu thiếu (NaN) bằng giá trị hợp lệ trước đó (Forward Fill).
        """
        if np.all(np.isnan(pos)):
            return np.zeros_like(pos)

        pos_ffill = pos.copy()
        mask = np.any(~np.isnan(pos_ffill), axis=1)
        if not mask.any():
            return np.zeros_like(pos_ffill)

        valid_idx = np.where(mask)[0]
        first, last = valid_idx[0], valid_idx[-1]
        pos_ffill[:first] = pos_ffill[first]
        pos_ffill[last + 1:] = pos_ffill[last]
        df_temp = pd.DataFrame(pos_ffill)
        df_temp = df_temp.ffill()
        return df_temp.to_numpy()
    
    def _speed_series(self, cx: pd.Series, cy: pd.Series) -> pd.Series:
        dx = cx.diff()
        dy = cy.diff()
        v = np.hypot(dx, dy).fillna(0.0) * self.cfg.fps
        return v.astype("float32")
    
    def _roll_future_mean(self, s: pd.Series, w: int, min_p: int = 1) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).mean().iloc[::-1]

    def _roll_future_var(self, s: pd.Series, w: int, min_p: int = 2) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).var().iloc[::-1]

    # --- Core Logic ---
    def _compute_kinematics(self, pos_px: np.ndarray):
        """
        Tính toán vật lý cơ bản: Pos(cm), Vel, Speed, Acc.
        Input: Array [Frames, 2] (pixel).
        Output: Tuple (pos_cm, vel, speed, acc).
        """
        pos_ffill = self._forward_fill_nan(pos_px)
        pos_cm = self._to_cm(pos_ffill.astype(np.float32))
        pos_cm = self._smooth(pos_cm)                                               # [F, 2]

        dt = 1.0 / self.cfg.fps
        vel = np.zeros_like(pos_cm, dtype=np.float32)
        vel[1:] = (pos_cm[1:] - pos_cm[:-1]) / dt                                   # [F, 2: (vx, vy)]
        speed = np.linalg.norm(vel, axis=1, keepdims=True).astype(np.float32)       # [F, 1]

        acc = np.zeros_like(pos_cm, dtype=np.float32)                          
        acc[1:] = (vel[1:] - vel[:-1]) / dt                                         # [F, 2:(ax, ay)]
        return pos_cm.astype(np.float32), vel, speed, acc

    def _build_context(self, frames, pos_px, mouse_df=None) -> AgentContext:
        """
        Build the AgentContext of one mouse.

        Kinematics are computed from `pos_px`; x, y and speed are additionally
        exposed as Series indexed by the frame index (named "frame"), and
        `mouse_df` is stored unchanged as the raw body-part data.
        """
        pos_cm, vel, speed, acc = self._compute_kinematics(pos_px)
        frame_index = pd.Index(frames, name="frame")

        def per_frame(values):
            return pd.Series(values, index=frame_index)

        return AgentContext(
            idx=frame_index,
            pos=pos_cm,
            vel=vel,
            speed=speed,
            acc=acc,
            cx=per_frame(pos_cm[:, 0]),
            cy=per_frame(pos_cm[:, 1]),
            speed_series=per_frame(speed[:, 0]),
            raw_df=mouse_df,
        )

    # --- Feature Modules ---
    def _feat_basic_kinematics(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Lấy các giá trị thô: tọa độ x, y, vận tốc vx, vy, tốc độ, gia tốc ax, ay.
        """
        return {
            "a_x": ctx.pos[:, 0], "a_y": ctx.pos[:, 1],
            "a_vx": ctx.vel[:, 0], "a_vy": ctx.vel[:, 1],
            "a_speed": ctx.speed[:, 0],
            "a_ax": ctx.acc[:, 0], "a_ay": ctx.acc[:, 1]
        }

    def _feat_multiscale(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tính tốc độ trung bình (Mean) và độ lệch chuẩn (Std) ở đa mức thời gian.
        Feature 'sp_ratio' đo độ bùng nổ (Burstiness).
        """
        feats = {}
        speed = ctx.speed_series
        frame_scales = [10, 40, 160]
        for scale in frame_scales:
            ws = self._scale(scale)
            if len(speed) >= ws:
                roller = speed.rolling(ws, min_periods=max(1, ws//4), center=True)
                feats[f"sp_m{scale}"] = roller.mean().astype("float32")
                feats[f"sp_s{scale}"] = roller.std().astype("float32")
        feats[f"sp_ratio"] = feats["sp_m10"] / (feats["sp_m160"] + 1e-6)
        return feats 
        
    def _feat_long_range(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Đặc trưng ngữ cảnh dài hạn:
        - x_ml, y_ml: Vị trí trung bình trong quá khứ.
        - sp_pct: Xếp hạng (percentile) của tốc độ hiện tại so với quá khứ.
        """
        feats: Dict[str, pd.Series] = {}
        speed = ctx.speed_series

        for window in [120, 240]:
            ws = self._scale(window)
            if len(ctx.cx) >= ws:
                feats[f"x_ml{window}"] = ctx.cx.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()
                feats[f"y_ml{window}"] = ctx.cy.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()

        for span in [60, 120]:
            s = self._scale(span)
            feats[f"x_e{span}"] = ctx.cx.ewm(span=s, min_periods=1).mean()
            feats[f"y_e{span}"] = ctx.cy.ewm(span=s, min_periods=1).mean()

        for window in [60, 120]:
            ws = self._scale(window)
            if len(speed) >= ws:
                feats[f"sp_pct{window}"] = speed.rolling(
                    ws, min_periods=max(5, ws // 6), center=True
                ).rank(pct=True)
        return feats
    

    def _feat_curvature(self, ctx: AgentContext, **kwargs) -> Dict:
        feats = {}

        vel_x, vel_y = ctx.vel[:, 0], ctx.vel[:, 1]
        acc_x, acc_y = ctx.acc[:, 0], ctx.acc[:, 1]
        cross_prod = vel_x * acc_y - vel_y * acc_x
        vel_mag = np.sqrt(vel_x**2 + vel_y**2)
        moving_mask = vel_mag > 2.0
        vel_mag_safe = np.maximum(vel_mag, 0.1 / self.cfg.fps)
        raw_curv = cross_prod / (vel_mag_safe**3)
        raw_curv = np.where(moving_mask, raw_curv, 0.0)
        min_turn_radius_cm = 0.5
        max_k = 1.0 / min_turn_radius_cm
        raw_curv = np.clip(raw_curv, -max_k, max_k)
        abs_curv = np.abs(raw_curv)
        abs_curv_series = pd.Series(abs_curv, index=ctx.idx)

        for w in [30, 60]:
            ws = self._scale(w)
            min_p = max(ws // 3, 1)
            feats[f"curv_mean_{w}"] = abs_curv_series.rolling(ws, min_periods=min_p).mean()

        angle = np.arctan2(vel_y, vel_x)
        angle_series = pd.Series(angle, index=ctx.idx)
        angle_change = np.abs(angle_series.diff().fillna(0.0))
        angle_change = np.where(angle_change > np.pi, 2 * np.pi - angle_change, angle_change)
        angle_change_series = pd.Series(angle_change, index=ctx.idx)
        angle_change_series = pd.Series(np.where(moving_mask, angle_change_series, 0.0), index=ctx.idx)

        ws = self._scale(30)
        feats["turn_rate_30"] = angle_change_series.rolling(ws, min_periods=max(ws // 3, 1)).sum()

        return feats
    
    def _feat_cumulative(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tổng quãng đường di chuyển trong một khoảng thời gian dài xung quanh frame hiện tại.
        """
        feats = {}
        L = max(1, self._scale(180))
        step = np.hypot(ctx.cx.diff(), ctx.cy.diff()).fillna(0.0)
        path = step.rolling(2 * L + 1, min_periods=max(5, L // 6), center=True).sum()
        feats["path_cum180"] =  path.fillna(0.0).astype("float32")
        return feats

    def _feat_speed_asym(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Bất đối xứng tốc độ (Tương lai - Quá khứ).
        """
        w = max(3, self._scale(30))
        v = ctx.speed_series
        v_past = v.rolling(w, min_periods=1).mean()
        v_fut = self._roll_future_mean(v, w, min_p=1)
        return {"spd_asym_1s": (v_fut - v_past).fillna(0.0)}
    
    def _feat_gauss_shift(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Độ lệch Gaussian (KL Divergence) giữa quá khứ và tương lai.
        Đo lường sự thay đổi trạng thái thống kê.
        """
        w = max(5, self._scale(30))
        v = ctx.speed_series
        mu_p = v.rolling(w, min_periods=1).mean()
        va_p = v.rolling(w, min_periods=1).var().clip(lower=1e-6)
        mu_f = self._roll_future_mean(v, w, min_p=1)
        va_f = self._roll_future_var(v, w, min_p=1).clip(lower=1e-6)

        kl_pf = 0.5 * (
            (va_p / va_f) + ((mu_f - mu_p) ** 2) / va_f - 1.0 + np.log(va_f / va_p)
        )
        kl_fp = 0.5 * (
            (va_f / va_p) + ((mu_p - mu_f) ** 2) / va_p - 1.0 + np.log(va_p / va_f)
        )
        return {
            "spd_symkl_1s": (kl_pf + kl_fp).replace([np.inf, -np.inf], np.nan).fillna(0.0)
        }
    
    def _extract_part(self, ctx: AgentContext, part: str) -> Optional[np.ndarray]:
        if ctx.raw_df is None: return None
        if part not in ctx.raw_df.columns.get_level_values(0): return None
        try:
            sub_df = ctx.raw_df.xs(part, axis=1, level=0)[["x", "y"]].reindex(ctx.idx)
        except KeyError: return None
        raw = sub_df.to_numpy()
        raw = self._forward_fill_nan(raw)
        cm = self._to_cm(raw.astype(np.float32))
        return self._smooth(cm)
    
    def _extract_parts_dict(self, ctx: AgentContext, parts: List[str] = None) -> Dict[str, Optional[np.ndarray]]:
        out = {}
        for p in parts:
            out[p] = self._extract_part(ctx, p)
        return out
        
    def _feat_pose_shape(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Placeholder cho các đặc trưng hình dáng (Elongation, Body Angle...).
        """
        feats = {}

        def zero(): return pd.Series(0.0, index=ctx.idx, dtype="float32")

        def dist(k1, k2):
            p1, p2 = parts.get(k1), parts.get(k2)
            if p1 is None or p2 is None: return zero()
            d = np.linalg.norm(p1 - p2, axis=1)
            return pd.Series(d, index=ctx.idx, dtype="float32")
        
        def elongation():
            if parts.get("nose")          is None: return zero()
            if parts.get("tail_base")     is None: return zero()
            if parts.get("ear_left")  is None: return zero()
            if parts.get("ear_right") is None: return zero()

            d1 = dist("nose", "tail_base")
            d2 = dist("ear_left", "ear_right")
            elongation = d1 / (d2 + 1e-6).astype("float32")
            return elongation
        
        def part_speed(part: str, n_frames_30fps: int) -> Dict:
            part_pos = self._extract_part(ctx, part)
            if part_pos is None: return zero()
            
            s_x = pd.Series(part_pos[:, 0], index=ctx.idx)
            s_y = pd.Series(part_pos[:, 1], index=ctx.idx)
            raw_speed = self._speed_series(s_x, s_y)

            ws = self._scale(n_frames_30fps)
            val = raw_speed.rolling(ws, min_periods=1, center=True).mean()
            return val.astype("float32")


        target_parts = ["head", "ear_left", "ear_right", "tail_base"]
        
        parts = self._extract_parts_dict(ctx, target_parts)

        feats["aa_head_tailbase_dist"]       = dist("head", "tail_base")
        feats["aa_earleft_tailbase_dist"]    = dist("ear_left", "tail_base")
        feats["aa_earright_tailbase_dist"]   = dist("ear_right", "tail_base")
        feats["aa_head_earleft_dist"]        = dist("ear_left", "head")
        feats["aa_head_ear_right_dist"]      = dist("ear_right", "head")
        
        feats["a_elongation"]                = elongation()
        feats["a_tail_base_vel_500ms"]       = part_speed("tail_base", 15)
        feats["a_tail_base_vel_1000ms"]      = part_speed("tail_base", 30)
        feats["a_tail_base_vel_2000ms"]      = part_speed("tail_base", 60)
        feats["a_tail_base_vel_3000ms"]      = part_speed("tail_base", 90)
        feats["a_head_vel_500ms"]            = part_speed("head", 15)
        feats["a_head_vel_1000ms"]           = part_speed("head", 30)
        feats["a_head_vel_2000ms"]           = part_speed("head", 60)
        feats["a_head_vel_3000ms"]           = part_speed("head", 90)

        feats["a_ear_right_vel_500ms"]       = part_speed("ear_right", 15)
        feats["a_ear_right_vel_1000ms"]      = part_speed("ear_right", 30)
        feats["a_ear_right_vel_2000ms"]      = part_speed("ear_right", 60)
        feats["a_ear_right_vel_3000ms"]       = part_speed("ear_right", 90)
        feats["a_ear_left_vel_500ms"]        = part_speed("ear_left", 15)
        feats["a_ear_left_vel_1000ms"]       = part_speed("ear_left", 30)
        feats["a_ear_left_vel_2000ms"]       = part_speed("ear_left", 60)
        feats["a_ear_left_vel_3000ms"]       = part_speed("ear_left", 90)
        
        return feats

    def _feat_attack_sniff(
        self,
        ctx: AgentContext,
        target_ctx: AgentContext = None,
        **kwargs
    ) -> Dict[str, pd.Series]:
        """
        Đặc trưng phân biệt attack vs sniff cho lab 2-mouse (agent=1, target=2).
    
        Ý tưởng:
          - attack: speed 2 con biến động mạnh, đổi hướng nhiều, body overlap cao.
          - sniff : mũi gần cổ/thân, overlap thấp hơn, motion nhẹ/ổn định hơn.
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
    
        def zero():
            return pd.Series(0.0, index=idx, dtype="float32")

        # helper khoảng cách
        def dist(p1, p2):
            if p1 is None or p2 is None:
                return zero()
            d = np.linalg.norm(p1 - p2, axis=1)
            return pd.Series(d, index=idx, dtype="float32")

        parts_a = self._extract_parts_dict(ctx, ["head", "tail_base"])
        parts_t = self._extract_parts_dict(target_ctx, ["head", "tail_base"])
    
        # ---------------------------------------------------------
        # 2) ĐIỂM ĐẠI DIỆN THÂN (BODY CENTER) CHO MỖI CON
        #    dùng trung bình neck – hips – tail_base
        # ---------------------------------------------------------
    
        # ---------------------------------------------------------
        # 4) MỨC ĐỘ “BẠO LỰC”: DAO ĐỘNG TỐC ĐỘ & ĐỔI HƯỚNG
        # ---------------------------------------------------------
        # speed 2 con từ velocity
        a_speed = pd.Series(
            np.linalg.norm(ctx.vel, axis=1),
            index=idx,
            dtype="float32",
        )
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32",
        )

        ws_05 = self._scale(15)  # ~0.5s
        mp_05 = max(ws_05 // 3, 1)
    
        feats["as_a_speed_std_05"] = (
            a_speed.rolling(ws_05, min_periods=mp_05).std().fillna(0.0).astype("float32")
        )
        feats["as_t_speed_std_05"] = (
            t_speed.rolling(ws_05, min_periods=mp_05).std().fillna(0.0).astype("float32")
        )
        feats["as_speed_std_sum_05"] = (
            feats["as_a_speed_std_05"] + feats["as_t_speed_std_05"]
        )
    
        # Đổi hướng (jerk góc) của agent
        a_angle = np.arctan2(ctx.vel[:, 1], ctx.vel[:, 0])
        a_angle_diff = np.abs(np.diff(a_angle))
        a_angle_diff = np.where(
            a_angle_diff > np.pi, 2 * np.pi - a_angle_diff, a_angle_diff
        )
        a_angle_diff = np.concatenate([[0.0], a_angle_diff])
        a_angle_diff_s = pd.Series(a_angle_diff, index=idx, dtype="float32")
    
        feats["as_a_turn_jerk_05"] = (
            a_angle_diff_s.rolling(ws_05, min_periods=mp_05)
            .sum()
            .fillna(0.0)
            .astype("float32")
        )

        # ---------------------------------------------------------
        # 5) XẤP XỈ OVERLAP CƠ THỂ (BODY OVERLAP)
        #    dùng bbox từ các bộ phận thân
        # ---------------------------------------------------------
        def build_bbox(parts: Dict[str, Optional[np.ndarray]]):
            arrs = []
            for k in ["head", "ear_left", "ear_right", "tail_base"]:
                if parts.get(k) is not None:
                    arrs.append(parts[k])
            if not arrs:
                return None
            stack = np.stack(arrs, axis=1)  # [F, K, 2]
            xs = stack[:, :, 0]
            ys = stack[:, :, 1]
            xmin = np.nanmin(xs, axis=1)
            xmax = np.nanmax(xs, axis=1)
            ymin = np.nanmin(ys, axis=1)
            ymax = np.nanmax(ys, axis=1)
            return np.stack([xmin, ymin, xmax, ymax], axis=1).astype("float32")
    
        def iou_box(box1: np.ndarray, box2: np.ndarray):
            # box: [F, 4] = (xmin, ymin, xmax, ymax)
            x1 = np.maximum(box1[:, 0], box2[:, 0])
            y1 = np.maximum(box1[:, 1], box2[:, 1])
            x2 = np.minimum(box1[:, 2], box2[:, 2])
            y2 = np.minimum(box1[:, 3], box2[:, 3])
    
            inter_w = np.clip(x2 - x1, 0.0, None)
            inter_h = np.clip(y2 - y1, 0.0, None)
            inter = inter_w * inter_h
    
            area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])
            area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
            union = area1 + area2 - inter + 1e-6
            iou = inter / union
            return iou.astype("float32")

        bbox_a = build_bbox(parts_a)
        bbox_t = build_bbox(parts_t)
        if bbox_a is not None and bbox_t is not None:
            iou = iou_box(bbox_a, bbox_t)
            iou_s = pd.Series(iou, index=idx, dtype="float32")
    
            feats["as_body_iou"] = iou_s
    
            ws_1s = self._scale(30)
            mp_1s = max(ws_1s // 3, 1)
            feats["as_body_iou_mean_1s"] = (
                iou_s.rolling(ws_1s, min_periods=mp_1s).mean().fillna(0.0).astype("float32")
            )
        else:
            feats["as_body_iou"] = zero()
            feats["as_body_iou_mean_1s"] = zero()
    
        # ---------------------------------------------------------
        # 6) DỌN NẠN NaN / Inf
        # ---------------------------------------------------------
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )
    
        return feats

    def _feat_climb(self, ctx: AgentContext, **kwargs) -> Dict[str, pd.Series]:
        """
        Feature chuyên cho hành vi climb trong arena hình chữ nhật (33 x 19 cm).
    
        Ý tưởng:
          - Chuột đi gần tường: dist_wall giảm nhanh.
          - Khi climb: sát tường (dist_wall nhỏ), v_normal ~ 0,
            nhưng vẫn có v_tangent (bò ngang trên tường / di chuyển dọc biên).
        """
        feats: Dict[str, pd.Series] = {}
        idx = ctx.idx
    
        def zero() -> pd.Series:
            return pd.Series(0.0, index=idx, dtype="float32")
    
        # --- 1. Arena size (cm) ---
        # Nếu bạn đã set trong FeatureConfig thì dùng:
        # W = self.cfg.arena_width_cm or 33.0
        # H = self.cfg.arena_height_cm or 19.0
        # Ở đây fix luôn cho lab này:
        W = 33.0
        H = 19.0
        parts = self._extract_parts_dict(ctx, ["head"])
        head = parts.get("head")
        
        if head is not None:
            # head đã ở đơn vị cm (vì _extract_part đã to_cm + smooth)
            cx = pd.Series(head[:, 0], index=idx)
            cy = pd.Series(head[:, 1], index=idx)
        else:
            # fallback: nếu không có head thì dùng body_center như cũ
            cx = ctx.cx
            cy = ctx.cy


        # # --- 2. Khoảng cách tới 4 bức tường ---
        # cx = ctx.cx  # Series
        # cy = ctx.cy  # Series
    
        dist_left   = cx - 0.0
        dist_right  = W - cx
        dist_bottom = cy - 0.0
        dist_top    = H - cy
    
        d_all = np.stack(
            [dist_left.values, dist_right.values, dist_bottom.values, dist_top.values],
            axis=1,  # [F, 4]
        )
    
        dist_wall = np.min(d_all, axis=1)          # khoảng cách tới tường gần nhất
        wall_idx  = np.argmin(d_all, axis=1)       # 0:left, 1:right, 2:bottom, 3:top
    
        dist_wall_s = pd.Series(dist_wall, index=idx, dtype="float32")
        feats["climb_dist_wall"] = dist_wall_s
    
        # --- 3. Vận tốc theo NORMAL & TANGENT của tường gần nhất ---
        vx = ctx.vel[:, 0]
        vy = ctx.vel[:, 1]
    
        # normal hướng VÀO trong arena từ tường
        nx = np.zeros_like(vx, dtype="float32")
        ny = np.zeros_like(vy, dtype="float32")

        # left  wall (x=0)    → normal = (+1, 0)
        # right wall (x=W)    → normal = (-1, 0)
        # bottom wall (y=0)   → normal = (0, +1)
        # top wall (y=H)      → normal = (0, -1)
        nx[wall_idx == 0] =  1.0
        nx[wall_idx == 1] = -1.0
        ny[wall_idx == 2] =  1.0
        ny[wall_idx == 3] = -1.0
    
        # v_normal = v ⋅ n
        v_normal = vx * nx + vy * ny
    
        # thành phần song song tường: v_tan = v - (v⋅n)n
        v_proj_x = v_normal * nx
        v_proj_y = v_normal * ny
        v_tan_x = vx - v_proj_x
        v_tan_y = vy - v_proj_y
        v_tangent = np.sqrt(v_tan_x ** 2 + v_tan_y ** 2)
    
        v_normal_s  = pd.Series(v_normal,  index=idx, dtype="float32")
        v_tangent_s = pd.Series(v_tangent, index=idx, dtype="float32")
    
        feats["climb_normal_vel"]  = v_normal_s
        feats["climb_tangent_vel"] = v_tangent_s
    
        # --- 4. Approach speed: dist_wall giảm mạnh (lao vào tường) ---
        ws = self._scale(15)  # ~0.5s (15 frame ở 30fps)
        min_p = max(ws // 3, 1)

        # diff_dw > 0 khi dist_wall giảm (đi về phía tường)
        diff_dw = -dist_wall_s.diff().fillna(0.0)  # dấu trừ để "giảm" → dương
        approach = diff_dw.rolling(ws, min_periods=min_p).mean()
        feats["climb_approach_speed_wall"] = approach.astype("float32")
    
        # --- 5. Stick score: sát tường + không còn lao vào (v_normal nhỏ) ---
        # gần tường
        thr_cm = 3.0  # tuỳ chỉnh (3cm sát tường)
        near_wall = (dist_wall_s < thr_cm).astype("float32")
    
        # ít lao vào nữa: |v_normal| nhỏ
        stick = near_wall * (1.0 / (1.0 + v_normal_s.abs()))

        # Nếu muốn climb thực sự có chút chuyển động dọc tường:
        # yêu cầu v_tangent > một ngưỡng nhỏ (ví dụ 0.5 cm/s)
        stick = stick * (v_tangent_s > 0.5).astype("float32")
    
        feats["climb_wall_stick_score"] = stick.astype("float32")
    
        # --- 6. Clean NaN/Inf ---
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )
    
        return feats


    def _feat_pairwise(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict:
        """
        Đặc trưng tương tác cặp đôi (Pairwise): Khoảng cách, Tốc độ tiếp cận.
        """
        feats = {}
        if target_ctx is None: 
            return feats

        def zero(): return pd.Series(0.0, index=ctx.idx, dtype="float32")

        def dist_ab(pt_a, pt_b):
            if pt_a is None or pt_b is None: return zero()
            d = np.linalg.norm(pt_a - pt_b, axis=1)
            return pd.Series(d, index=ctx.idx, dtype="float32")

        rel_vec = target_ctx.pos - ctx.pos
        dist = np.linalg.norm(rel_vec, axis=1)
        feats["rel_dist"] = pd.Series(dist, index=ctx.idx, dtype="float32")

        # Khoảng cách
        my_parts = self._extract_parts_dict(ctx, ["head"])
        target_parts = self._extract_parts_dict(target_ctx, ["head", "tail_base", "ear_left", "ear_right"])

        ah, th = my_parts["head"], target_parts["head"]
        feats["dist_head_head"] = dist_ab(ah, th)
        feats["dist_head_tail"] = dist_ab(ah, target_parts["tail_base"])
        feats["dist_head_el"]   = dist_ab(ah, target_parts["ear_left"])
        feats["dist_head_er"]   = dist_ab(ah, target_parts["ear_right"])

        #  Hướng - góc nhìn
        def get_body_vec(parts_dict):
            head = parts_dict.get("head")
            tail = parts_dict.get("tail_base")
            if head is not None and tail is not None:
                return head - tail
            return None

        a_vec = get_body_vec(my_parts)
        t_vec = get_body_vec(target_parts)

        if a_vec is not None and t_vec is not None:
            dot = np.sum(a_vec * t_vec, axis=1)
            mags = np.linalg.norm(a_vec, axis=1) * np.linalg.norm(t_vec, axis=1)
            feats["body_cosine"] = pd.Series(
                np.clip(dot / (mags + 1e-6), -1.0, 1.0), index=idx, dtype="float32"
            )
        else:
            feats["body_cosine"] = zero()

        # Vector ánh nhìn = Target_Pos - My_Pos = rel_vec
        if a_vec is not None:
            dot_gaze = np.sum(a_vec * rel_vec, axis=1)
            mag_a = np.linalg.norm(a_vec, axis=1)
            feats["gaze_cosine"] = pd.Series(
                np.clip(dot_gaze / (mag_a * dist + 1e-6), -1.0, 1.0),
                index=ctx.idx, dtype="float32"
            )
        else:
            feats["gaze_cosine"] = zero()

        # Vector đơn vị hướng về địch (u)
        dist_safe = dist.copy()
        dist_safe[dist_safe == 0] = 1e-6
        u_vec = rel_vec / dist_safe[:, None]

        # a_vel và t_vel lấy từ Context
        a_vel, t_vel = ctx.vel, target_ctx.vel

        # A. Approach Speed (Vận tốc dọc trục nối 2 con)
        # Dương: Lao vào nhau | Âm: Chạy ra xa nhau
        a_along = np.sum(a_vel * u_vec, axis=1)
        t_along = np.sum(t_vel * (-u_vec), axis=1) # Target hướng ngược lại
        rel_along = np.sum((a_vel - t_vel) * u_vec, axis=1)

        # B. Lateral Speed (Vận tốc ngang - Vuông góc trục nối)
        # Vector chiếu: v_proj = (v . u) * u
        a_proj = a_along[:, None] * u_vec
        a_lat_vec = a_vel - a_proj
        a_lat_speed = np.linalg.norm(a_lat_vec, axis=1)

        feats["approach_speed_agent"]  = pd.Series(a_along, index=ctx.idx, dtype="float32")
        feats["approach_speed_target"] = pd.Series(t_along, index=ctx.idx, dtype="float32")
        feats["approach_speed_rel"]    = pd.Series(rel_along, index=ctx.idx, dtype="float32")
        feats["lateral_speed_agent"]   = pd.Series(a_lat_speed, index=ctx.idx, dtype="float32")
        return feats

    def _feat_follow_pattern(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Đặc trưng hành vi FOLLOW:
          - Agent ở gần target
          - Cùng hướng (body + velocity)
          - Tốc độ vừa phải
          - Khoảng cách tương đối ổn định trong 0.5–1s
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")
    
        # --- 1. CÁC ĐẠI LƯỢNG CƠ BẢN ---
        # Vector Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        rel_dist = np.linalg.norm(rel_vec, axis=1)
        rel_dist_s = pd.Series(rel_dist, index=idx, dtype="float32")
    
        # Speed agent/target
        a_speed = ctx.speed_series.astype("float32")
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32",
        )
    
        # Body vector: nose - tail/body_center
        parts_a = self._extract_parts_dict(ctx, ["head", "tail_base", "ear_left", "ear_right"])
        parts_t = self._extract_parts_dict(target_ctx, ["head", "tail_base", "ear_right", "ear_left"])
    
        def body_vec(parts_dict):
            head = parts_dict.get("head")
            tail = parts_dict.get("tail_base")
            if head is None or tail is None:
                return None
            return head - tail
    
        a_body = body_vec(parts_a)
        t_body = body_vec(parts_t)
    
        if a_body is not None and t_body is not None:
            dot_bt = np.sum(a_body * t_body, axis=1)
            mag_bt = np.linalg.norm(a_body, axis=1) * np.linalg.norm(t_body, axis=1)
            cos_body = np.clip(dot_bt / (mag_bt + 1e-6), -1.0, 1.0)
            cos_body_s = pd.Series(cos_body, index=idx, dtype="float32")
        else:
            cos_body_s = zero()
    
        # Velocity hướng
        a_vel = ctx.vel
        t_vel = target_ctx.vel
        a_speed_np = np.linalg.norm(a_vel, axis=1)
        t_speed_np = np.linalg.norm(t_vel, axis=1)
        moving_mask = (a_speed_np > 1e-3) & (t_speed_np > 1e-3)
    
        # cos giữa hướng velocity 2 con
        dot_v = np.sum(a_vel * t_vel, axis=1)
        mag_v = a_speed_np * t_speed_np + 1e-6
        cos_vel = np.zeros_like(dot_v, dtype="float32")
        cos_vel[moving_mask] = np.clip(dot_v[moving_mask] / mag_v[moving_mask], -1.0, 1.0)
        cos_vel_s = pd.Series(cos_vel, index=idx, dtype="float32")
    
        # --- 2. WINDOW NGẮN (FOLLOW LÀ PATTERN DÀI HƠN ATTACK) ---
        for w30 in [15, 30, 60]:   # ~0.5s, 1s, 2s
            ws = self._scale(w30)
            min_p = max(ws // 3, 1)
    
            # Khoảng cách trung bình & độ dao động
            m_dist = rel_dist_s.rolling(ws, min_periods=min_p).mean()
            s_dist = rel_dist_s.rolling(ws, min_periods=min_p).std()
    
            # Cùng hướng (body + velocity)
            m_cos_body = cos_body_s.rolling(ws, min_periods=min_p).mean()
            m_cos_vel  = cos_vel_s.rolling(ws, min_periods=min_p).mean()
    
            # Tốc độ vừa phải
            m_sp_a = a_speed.rolling(ws, min_periods=min_p).mean()
            m_sp_t = t_speed.rolling(ws, min_periods=min_p).mean()
    
            feats[f"follow_dist_mean_{w30}"] = m_dist
            feats[f"follow_dist_std_{w30}"]  = s_dist
            feats[f"follow_cos_body_mean_{w30}"] = m_cos_body
            feats[f"follow_cos_vel_mean_{w30}"]  = m_cos_vel
            feats[f"follow_speed_agent_mean_{w30}"] = m_sp_a
            feats[f"follow_speed_target_mean_{w30}"] = m_sp_t
    
        # Clean
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )
    
        return feats
    
    def _feat_shortburst_social(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        feats = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")
    
        # --- Lấy lại vài quantity cơ bản từ pairwise/avoidance ---
        # vector Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        rel_dist = np.linalg.norm(rel_vec, axis=1)
        rel_dist_s = pd.Series(rel_dist, index=idx, dtype="float32")
    
        # unit vector
        rel_dist_safe = np.where(rel_dist == 0, 1e-6, rel_dist)
        u_vec = rel_vec / rel_dist_safe[:, None]
    
        # velocity dọc trục nối (approach speed)
        a_vel = ctx.vel
        t_vel = target_ctx.vel
        a_along = np.sum(a_vel * u_vec, axis=1)                # +: lao vào target
        t_along = np.sum(t_vel * (-u_vec), axis=1)             # +: target lao vào agent
        rel_along = np.sum((a_vel - t_vel) * u_vec, axis=1)    # +: lại gần nhau
    
        a_along_s = pd.Series(a_along, index=idx, dtype="float32")
        t_along_s = pd.Series(t_along, index=idx, dtype="float32")
        rel_along_s = pd.Series(rel_along, index=idx, dtype="float32")
    
        # speed agent / target
        a_speed = ctx.speed_series
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32"
        )
    
        # heading_rel_cos ~ escape / approach
        # vector body của agent
        # (reuse idea từ _feat_pairwise)
        # head ~ nose, tail ~ tail_base/body_center
        parts_a = self._extract_parts_dict(ctx, ["head", "tail_base"])
        head_a = parts_a.get("head")
        tail_a = parts_a.get("tail_base")
    
        if head_a is not None and tail_a is not None:
            body_vec_a = head_a - tail_a
            dot = np.sum(body_vec_a * rel_vec, axis=1)
            mag = np.linalg.norm(body_vec_a, axis=1) * rel_dist_safe
            heading_cos = np.clip(dot / (mag + 1e-6), -1.0, 1.0)
            heading_cos_s = pd.Series(heading_cos, index=idx, dtype="float32")
        else:
            heading_cos_s = zero()
    
        # --- Rolling window 10, 20, 30 frames (ở fps gốc) ---
        for w30 in [10, 20, 30]:
            ws = self._scale(w30)
            min_p = max(1, ws // 3)
    
            # Attack-like: approach mạnh, khoảng cách giảm nhanh
            feats[f"sb_att_approach_mean_{w30}"] = a_along_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_att_rel_along_mean_{w30}"] = rel_along_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_att_dist_delta_{w30}"] = (rel_dist_s - rel_dist_s.shift(ws)).fillna(0.0)
    
            # Chase-like: agent & target đều nhanh, dist tương đối nhỏ
            feats[f"sb_chase_speed_agent_mean_{w30}"] = a_speed.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_chase_speed_target_mean_{w30}"] = t_speed.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_chase_dist_mean_{w30}"] = rel_dist_s.rolling(ws, min_periods=min_p).mean()
    
            # Escape-like: heading ngược, dist tăng nhanh
            feats[f"sb_esc_heading_cos_mean_{w30}"] = heading_cos_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_esc_dist_gain_{w30}"] = (rel_dist_s.shift(-ws) - rel_dist_s).fillna(0.0)
    
        # clip & fillna
        for k, v in feats.items():
            feats[k] = v.replace([np.inf, -np.inf], np.nan).fillna(0.0).astype("float32")
    
        return feats



    # --- Methods tương thích ---
    
    def build_pose_tensor(self, tracking: pd.DataFrame):
        """
        Chuyển dữ liệu tracking (DataFrame) sang Tensor [Frames, Mice, 2] và Dict chi tiết.
        """
        tracking = tracking.sort_values("video_frame")
        frames = np.sort(tracking["video_frame"].unique())
        
        pvid = tracking.pivot(
            index="video_frame", 
            columns=["mouse_id", "bodypart"], 
            values=["x", "y"]
        )
        pvid = pvid.reorder_levels([1, 2, 0], axis=1).sort_index(axis=1).astype("float32")
        mouse_ids = list(pvid.columns.get_level_values(0).unique())
        pos = np.full((len(frames), len(mouse_ids), 2), np.nan, dtype=np.float32)
        per_mouse_df = {}
        
        for i, mid in enumerate(mouse_ids):
            single = pvid[mid]
            per_mouse_df[mid] = single
            
            if "body_center" in single.columns.get_level_values(0):
                cx = single["body_center"]["x"]
                cy = single["body_center"]["y"]
            else:
                cx = single.xs("x", level=1, axis=1).mean(axis=1)
                cy = single.xs("y", level=1, axis=1).mean(axis=1)
            
            pos[:, i, 0] = cx.reindex(frames).values
            pos[:, i, 1] = cy.reindex(frames).values
            
        return frames, mouse_ids, pos, per_mouse_df

    def extract_agent_target(
        self, 
        frames: np.ndarray, 
        mouse_ids: List[Any], 
        pos: np.ndarray, 
        agent_id: Any, 
        target_id: Any, 
        per_mouse_df: Dict = None
    ) -> pd.DataFrame:
        """
        Trích xuất đặc trưng cho cặp (Agent, Target).
        """
        try:
            aid_idx = mouse_ids.index(agent_id)
        except ValueError:
            return pd.DataFrame() 

        # 1. Build Agent Context
        ctx_agent = self._build_context(
            frames, 
            pos[:, aid_idx, :], 
            per_mouse_df.get(agent_id) if per_mouse_df else None
        )

        # 2. Build Target Context
        ctx_target = None
        if self.cfg.use_pairwise and target_id is not None and target_id in mouse_ids:
             tid_idx = mouse_ids.index(target_id)
             ctx_target = self._build_context(
                 frames, 
                 pos[:, tid_idx, :], 
                 per_mouse_df.get(target_id) if per_mouse_df else None
             )

        # 3. Run all features
        all_data = {}
        for func_name, func in self.feature_registry.items():
            out_dict = func(ctx_agent, target_ctx=ctx_target)
            all_data.update(out_dict)

        df_out = pd.DataFrame(all_data, index=ctx_agent.idx)
        df_out = df_out.replace([np.inf, -np.inf], np.nan).fillna(0.0)
        
        return df_out.reindex(sorted(df_out.columns), axis=1)

# ===========================================================================================
# ===========================================================================================
# ===========================================================================================





from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import gc
import itertools
import json
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedGroupKFold

# (Trên Kaggle) dùng metric chính thức
import sys
sys.path.append("/kaggle/usr/lib/mabe-f-beta")
from metric import score   # hàm score(submission_df, dataset_df)

# =========================================================
# 1. PATHS & CONFIGURATION
# =========================================================

# Root of the competition data and its tracking / annotation sub-folders.
INPUT_DIR = Path("/kaggle/input/MABe-mouse-behavior-detection")
TRAIN_TRACKING_DIR = INPUT_DIR / "train_tracking"
TRAIN_ANNOTATION_DIR = INPUT_DIR / "train_annotation"
TEST_TRACKING_DIR = INPUT_DIR / "test_tracking"


WORKING_DIR = Path("/kaggle/working")
# Folder holding per-(lab, behavior) results; created at import time.
# NOTE(review): this sits under /kaggle/input, which is normally read-only
# on Kaggle - confirm it is writable wherever results are saved.
RESULTS_DIR = Path(r"/kaggle/input/results-xgb-fe")
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

# Columns that identify one frame-level sample.
INDEX_COLS = ["video_id", "agent_id", "target_id", "video_frame"]

# "self" vs "pair" behaviors, same split as the reference notebook
# (adjust if needed).
SELF_BEHAVIORS = [
    "biteobject", "climb", "dig", "exploreobject", "freeze",
    "genitalgroom", "huddle", "rear", "rest", "run", "selfgroom",
]
PAIR_BEHAVIORS = [
    "allogroom", "approach", "attack", "attemptmount", "avoid",
    "chase", "chaseattack", "defend", "disengage", "dominance",
    "dominancegroom", "dominancemount", "ejaculate", "escape",
    "flinch", "follow", "intromit", "mount", "reciprocalsniff",
    "shepherd", "sniff", "sniffbody", "sniffface", "sniffgenital",
    "submit", "tussle",
]


# =========================================================
# 2. ĐỌC METADATA & HELPER
# =========================================================

def load_metadata() -> pd.DataFrame:
    """Read `train.csv` from `INPUT_DIR` and return it as a DataFrame."""
    return pd.read_csv(INPUT_DIR / "train.csv")


def get_video_params(video_id: Any, meta: pd.DataFrame) -> Tuple[float, float]:
    """
    Look up the frame rate and pixel scale of one video.

    Args:
        video_id: value searched for in the `video_id` column.
        meta: metadata table with the columns `video_id`,
            `frames_per_second` and `pix_per_cm_approx`.

    Returns:
        (fps, pix_per_cm) taken from the first matching row; a
        non-finite or non-positive pixel scale is replaced by 1.0.

    Raises:
        KeyError: when no row matches `video_id`.
    """
    matches = meta[meta["video_id"] == video_id]
    if len(matches) == 0:
        raise KeyError(f"video_id={video_id} không có trong train.csv")

    first = matches.iloc[0]
    fps = float(first["frames_per_second"])
    scale = float(first["pix_per_cm_approx"])

    scale_ok = np.isfinite(scale) and scale > 0
    return fps, (scale if scale_ok else 1.0)


def load_tracking(lab_id: str, video_id: Any) -> pd.DataFrame:
    """
    Read the training tracking parquet of one video into pandas.

    The file is looked up at TRAIN_TRACKING_DIR / <lab_id> / <video_id>.parquet.
    Expected schema (per the original note): video_frame, mouse_id,
    bodypart, x, y.

    Raises:
        FileNotFoundError: when the parquet file does not exist.
    """
    parquet_path = TRAIN_TRACKING_DIR / str(lab_id) / f"{video_id}.parquet"
    if parquet_path.exists():
        return pd.read_parquet(parquet_path)
    raise FileNotFoundError(parquet_path)

def load_tracking_test(lab_id: str, video_id: Any) -> pd.DataFrame:
    """
    Read the test tracking parquet of one video into pandas.

    The file is looked up at TEST_TRACKING_DIR / <lab_id> / <video_id>.parquet.

    Raises:
        FileNotFoundError: when the parquet file does not exist.
    """
    parquet_path = TEST_TRACKING_DIR / str(lab_id) / f"{video_id}.parquet"
    if parquet_path.exists():
        return pd.read_parquet(parquet_path)
    raise FileNotFoundError(parquet_path)


def load_annotation(lab_id: str, video_id: Any) -> pd.DataFrame:
    """
    Read the annotation table of one training video.

    Returns:
        DataFrame restricted to the columns agent_id, target_id, action,
        start_frame and stop_frame. When the parquet file is missing
        (no labels for this video) an empty DataFrame with those columns
        is returned instead.
    """
    wanted = ["agent_id", "target_id", "action", "start_frame", "stop_frame"]
    parquet_path = TRAIN_ANNOTATION_DIR / str(lab_id) / f"{video_id}.parquet"
    if not parquet_path.exists():
        return pd.DataFrame(columns=wanted)
    return pd.read_parquet(parquet_path)[wanted]


# =========================================================
# 3. TÍNH FEATURE PER-FRAME BẰNG FEATUREEXTRACTOR
# =========================================================

# In-memory cache of computed per-frame features:
# (lab, video, agent, target) -> (frames, feature_df).
# Entries are never evicted in the code shown here.
_feature_cache: Dict[Tuple[str, int, int, int], Tuple[np.ndarray, pd.DataFrame]] = {}


def get_frame_features_for_pair(
    lab_id: str,
    video_id: int,
    agent_id: int,
    target_id: int,
    meta: pd.DataFrame,
) -> Tuple[np.ndarray, pd.DataFrame]:
    """
    Compute (or fetch from `_feature_cache`) the per-frame features of one
    training video for a given (agent, target) pair.

    The agent and target may be the same mouse (self behaviors) or two
    different mice (pair behaviors).

    Returns:
        (frames, features_df): the frame ids and the feature table whose
        index is set to those frame ids.
    """
    cache_key = (str(lab_id), int(video_id), int(agent_id), int(target_id))
    hit = _feature_cache.get(cache_key)
    if hit is not None:
        return hit

    fps, pix_per_cm = get_video_params(video_id, meta)
    tracking = load_tracking(lab_id, video_id)

    extractor = FeatureExtractor(
        fps=fps,
        pix_per_cm=pix_per_cm,
        smooth_sigma=1.0,
        use_pairwise=True,
    )
    frames, mouse_ids, pos, per_mouse_df = extractor.build_pose_tensor(tracking)

    features_df: pd.DataFrame = extractor.extract_agent_target(
        frames=frames,
        mouse_ids=mouse_ids,
        pos=pos,
        agent_id=agent_id,
        target_id=target_id,
        per_mouse_df=per_mouse_df,
    )
    # the index is the frame id
    features_df.index = frames

    entry = (frames, features_df)
    _feature_cache[cache_key] = entry
    return entry

# NOTE: second binding of `_feature_cache`. It replaces the dict created
# earlier in this file with a fresh empty one; the key annotation here
# allows agent/target ids of any type.
_feature_cache: Dict[Tuple[str, int, Any, Any], Tuple[np.ndarray, pd.DataFrame]] = {}

def get_frame_features_for_pair_test(
    lab_id: str,
    video_id: int,
    agent_id: Any,
    target_id: Any,
    test_meta: pd.DataFrame,
) -> Tuple[np.ndarray, pd.DataFrame]:
    """
    Compute (or fetch from `_feature_cache`) the per-frame features of one
    test video for a given (agent, target) pair.

    The cache key is prefixed with "test_" so it cannot collide with keys
    created for training videos.

    Returns:
        (frames, features_df): the frame ids and the feature table whose
        index is set to those frame ids.
    """
    cache_key = (f"test_{lab_id}", int(video_id), agent_id, target_id)
    hit = _feature_cache.get(cache_key)
    if hit is not None:
        return hit

    # fps and pixel scale come from the first matching metadata row
    video_row = test_meta[test_meta["video_id"] == video_id].iloc[0]
    fps = float(video_row["frames_per_second"])
    pix_per_cm = float(video_row["pix_per_cm_approx"])
    if not (np.isfinite(pix_per_cm) and pix_per_cm > 0):
        pix_per_cm = 1.0

    tracking = load_tracking_test(lab_id, video_id)

    extractor = FeatureExtractor(
        fps=fps,
        pix_per_cm=pix_per_cm,
        smooth_sigma=1.0,
        use_pairwise=True,
    )
    frames, mouse_ids, pos, per_mouse_df = extractor.build_pose_tensor(tracking)

    features_df = extractor.extract_agent_target(
        frames=frames,
        mouse_ids=mouse_ids,
        pos=pos,
        agent_id=agent_id,
        target_id=target_id,
        per_mouse_df=per_mouse_df,
    )
    features_df.index = frames

    entry = (frames, features_df)
    _feature_cache[cache_key] = entry
    return entry



# =========================================================
# 4. BUILD FRAME-LEVEL DATASET CHO 1 (lab_id, behavior)
# =========================================================

def build_frame_dataset_for_lab_behavior(
    lab_id: str,
    behavior: str,
    train_meta: pd.DataFrame,
    mode: str = "self",
) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray]:
    """
    Build the frame-level dataset of one (lab, behavior).

    For every video of the lab that has annotations of `behavior`, and for
    every distinct (agent_id, target_id) in those annotations, the
    per-frame features are fetched and each frame is labelled 1 when it
    lies in any annotated [start_frame, stop_frame) interval of that pair.
    Pairs without a single positive frame are skipped.

    Args:
        lab_id: lab whose videos are scanned.
        behavior: value matched against the `action` column.
        train_meta: metadata table with `lab_id` and `video_id` columns.
        mode: "self" computes features with the agent as its own target
            and, when no row matches the exact pair, falls back to all
            rows of the agent; any other value uses the annotated target.

    Returns:
        indices: DataFrame with the INDEX_COLS columns.
        features: per-frame feature DataFrame.
        labels: int8 array of 0/1.
        All three are empty when nothing was collected.
    """
    lab_videos = (
        train_meta.loc[train_meta["lab_id"] == lab_id, "video_id"]
        .unique()
        .tolist()
    )
    self_mode = mode == "self"

    index_parts = []
    feature_parts = []
    label_parts = []

    for video_id in lab_videos:
        annotations = load_annotation(lab_id, video_id)
        if annotations.empty:
            continue

        # keep only rows of the requested behavior
        behavior_rows = annotations[annotations["action"] == behavior]
        if behavior_rows.empty:
            continue

        unique_pairs = (
            behavior_rows[["agent_id", "target_id"]].drop_duplicates().values.tolist()
        )
        for agent_id, target_id in unique_pairs:
            frames, feat_df = get_frame_features_for_pair(
                lab_id=lab_id,
                video_id=video_id,
                agent_id=agent_id,
                target_id=agent_id if self_mode else target_id,
                meta=train_meta,
            )

            same_agent = behavior_rows["agent_id"] == agent_id
            pair_rows = behavior_rows[
                same_agent & (behavior_rows["target_id"] == target_id)
            ]
            if pair_rows.empty and self_mode:
                pair_rows = behavior_rows[same_agent]

            # every frame covered by at least one [start, stop) interval
            positive_frames = set()
            for start, stop in zip(pair_rows["start_frame"], pair_rows["stop_frame"]):
                positive_frames.update(range(int(start), int(stop)))
            if not positive_frames:
                continue

            label = np.isin(frames, list(positive_frames)).astype("int8")
            if not label.any():
                continue

            index_parts.append(
                pd.DataFrame(
                    {
                        "video_id": video_id,
                        "agent_id": agent_id,
                        "target_id": target_id,
                        "video_frame": frames,
                    }
                )
            )
            feature_parts.append(feat_df.reset_index(drop=True))
            label_parts.append(label)

    if not index_parts:
        return (
            pd.DataFrame(columns=INDEX_COLS),
            pd.DataFrame(),
            np.zeros(0, dtype="int8"),
        )

    indices = pd.concat(index_parts, ignore_index=True)
    features = pd.concat(feature_parts, ignore_index=True)
    labels = np.concatenate(label_parts).astype("int8")

    assert len(indices) == len(features) == len(labels)

    return indices, features, labels


# =========================================================
# 5. TRAIN + OOF CHO 1 (lab_id, behavior)
# =========================================================

def tune_threshold(oof_pred: np.ndarray, y: np.ndarray) -> float:
    """
    Pick the decision threshold that maximises the F1 score.

    Thresholds from 0.0 upward in steps of 0.005 (`np.arange(0.0, 1.005,
    0.005)`) are tried; a sample is predicted positive when
    `oof_pred >= threshold`. On ties the smallest threshold wins.
    """
    candidates = np.arange(0.0, 1.005, 0.005)
    best_th = candidates[0]
    best_f1 = -np.inf
    for th in candidates:
        current = f1_score(y, (oof_pred >= th), zero_division=0)
        # strict comparison keeps the first (lowest) threshold on ties
        if current > best_f1:
            best_th, best_f1 = th, current
    return float(best_th)

#
def train_validate_one(
    lab_id: str,
    behavior: str,
    indices: pd.DataFrame,
    features: pd.DataFrame,
    labels: np.ndarray,
) -> float:
    """
    Train binary XGBoost models for one (lab, behavior) and store the OOF predictions.

    A 3-fold StratifiedGroupKFold grouped by ``video_id`` is used. For every
    fold the model and its tuned threshold are written to
    ``RESULTS_DIR/<lab>/<behavior>/fold_<k>/``; the out-of-fold table goes to
    ``oof_predictions.parquet`` and the score to ``f1.txt`` in the behavior
    directory.

    Note that each fold's threshold is tuned on the same validation rows it is
    then applied to.

    Returns the frame-level F1 over all out-of-fold hard labels (0.0 when there
    are no rows or no positive labels).
    """
    out_dir = RESULTS_DIR / lab_id / behavior
    out_dir.mkdir(parents=True, exist_ok=True)

    n_rows = len(labels)

    # Nothing to learn: write an all-negative OOF table and a zero score.
    if n_rows == 0 or labels.sum() == 0:
        empty_oof = indices.copy()
        empty_oof["fold"] = -1
        empty_oof["prediction"] = 0.0
        empty_oof["predicted_label"] = 0
        empty_oof.to_parquet(out_dir / "oof_predictions.parquet", index=False)
        (out_dir / "f1.txt").write_text("0.0\n")
        return 0.0

    X = features.values.astype("float32")
    y = labels.astype("int8")
    video_groups = indices["video_id"].values
    feature_names = features.columns.tolist()

    # Per-row bookkeeping; fold id -1 marks a row that never landed in a validation split.
    fold_of_row = np.full(n_rows, -1, dtype="int8")
    oof_prob = np.zeros(n_rows, dtype="float32")
    oof_hard = np.zeros(n_rows, dtype="int8")

    splitter = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=42)

    for fold_no, (train_rows, valid_rows) in enumerate(
        splitter.split(X, y, groups=video_groups)
    ):
        fold_dir = out_dir / f"fold_{fold_no}"
        fold_dir.mkdir(parents=True, exist_ok=True)

        X_train, y_train = X[train_rows], y[train_rows]
        X_valid, y_valid = X[valid_rows], y[valid_rows]

        # Class re-weighting: negatives / positives of the training part.
        n_pos = y_train.sum()
        n_neg = len(y_train) - n_pos
        if n_pos > 0:
            pos_weight = float(n_neg / n_pos)
        else:
            pos_weight = 1.0

        params = dict(
            objective="binary:logistic",
            eval_metric="logloss",
            device="cuda",
            tree_method="hist",
            learning_rate=0.05,
            max_depth=6,
            min_child_weight=5,
            subsample=0.8,
            colsample_bytree=0.8,
            scale_pos_weight=pos_weight,
            max_bin=64,
            seed=42,
        )

        dtrain = xgb.QuantileDMatrix(
            X_train, label=y_train, feature_names=feature_names, max_bin=64
        )
        dvalid = xgb.DMatrix(X_valid, label=y_valid, feature_names=feature_names)

        evals_result: Dict[str, Dict[str, List[float]]] = {}

        # Stop once the validation logloss has not improved for 10 rounds.
        stopper = xgb.callback.EarlyStopping(
            rounds=10, metric_name="logloss", data_name="valid", maximize=False
        )

        booster = xgb.train(
            params,
            dtrain,
            num_boost_round=250,
            evals=[(dtrain, "train"), (dvalid, "valid")],
            callbacks=[stopper],
            evals_result=evals_result,
            verbose_eval=False,
        )

        valid_prob = booster.predict(dvalid)
        th = tune_threshold(valid_prob, y_valid)

        fold_of_row[valid_rows] = fold_no
        oof_prob[valid_rows] = valid_prob
        oof_hard[valid_rows] = (valid_prob >= th).astype("int8")

        booster.save_model(fold_dir / "model.json")
        (fold_dir / "threshold.txt").write_text(f"{th}\n")

    # Persist the out-of-fold table next to the fold directories.
    oof_table = indices.copy()
    oof_table["fold"] = fold_of_row
    oof_table["prediction"] = oof_prob
    oof_table["predicted_label"] = oof_hard
    oof_table.to_parquet(out_dir / "oof_predictions.parquet", index=False)

    f1 = f1_score(y, oof_hard, zero_division=0)
    (out_dir / "f1.txt").write_text(f"{f1:.6f}\n")
    return float(f1)

def load_models_for_behavior_infer(lab_id: str, behavior: str):
    """
    Load the saved fold models and thresholds of one (lab, behavior) for inference.

    Scans ``RESULTS_DIR/<lab_id>/<behavior>/fold_*`` in sorted order. A fold
    directory without ``model.json`` is skipped; a fold without
    ``threshold.txt`` gets the threshold 0.5.

    Returns a list of ``(booster, threshold)`` tuples, empty when the behavior
    directory does not exist.
    """
    loaded = []
    behavior_dir = RESULTS_DIR / lab_id / behavior
    if not behavior_dir.exists():
        return loaded

    for fold_path in sorted(behavior_dir.glob("fold_*")):
        model_path = fold_path / "model.json"
        if not model_path.exists():
            continue

        booster = xgb.Booster()
        booster.load_model(str(model_path))

        threshold_path = fold_path / "threshold.txt"
        threshold = (
            float(threshold_path.read_text().strip())
            if threshold_path.exists()
            else 0.5
        )
        loaded.append((booster, threshold))

    return loaded


# =========================================================
# 6. LOOP QUA TẤT CẢ BEHAVIORS TRONG 1 LAB
#    (train_all_labs_behaviors vẫn giữ nguyên, nhưng main
#     sẽ filter train_meta chỉ còn 1 lab)
# =========================================================

def train_all_labs_behaviors(train_meta: pd.DataFrame):
    """
    Train one frame-level model per (lab, action) for every lab in ``train_meta``.

    For each lab:
      - read the annotations of all its videos,
      - collect the distinct actions that occur in them,
      - build the frame dataset for every action and train/validate it.

    Progress, dataset sizes and the OOF F1 are printed; nothing is returned.
    """
    t_start = time.perf_counter()

    for lab_id in train_meta["lab_id"].unique().tolist():
        # Videos that belong to this lab.
        lab_videos = (
            train_meta.loc[train_meta["lab_id"] == lab_id, "video_id"]
            .unique()
            .tolist()
        )

        # Every action that appears in this lab's annotations.
        seen_actions = set()
        for vid in lab_videos:
            ann = load_annotation(lab_id, vid)
            if not ann.empty:
                seen_actions.update(ann["action"].unique().tolist())

        behaviors = sorted(seen_actions)
        print(f"\n===== LAB {lab_id}: {len(behaviors)} behaviors =====")

        for behavior in behaviors:
            if behavior in SELF_BEHAVIORS:
                mode = "self"
            else:
                mode = "pair"

            print(f"\n=== LAB={lab_id} | behavior={behavior} | mode={mode} ===")
            indices, features, labels = build_frame_dataset_for_lab_behavior(
                lab_id=str(lab_id),
                behavior=behavior,
                train_meta=train_meta,
                mode=mode,
            )
            n_features = features.shape[1] if not features.empty else 0
            print(
                f"frames: {len(labels):,}, positives: {labels.sum():,}, features: "
                f"{n_features}"
            )

            if len(labels) == 0:
                print(" -> skip (no samples)")
                continue

            f1 = train_validate_one(str(lab_id), behavior, indices, features, labels)
            elapsed = time.perf_counter() - t_start
            print(f" -> OOF F1 (frame-level): {f1:.3f} | elapsed={elapsed/60:.1f} min")



# =========================================================
# 7. GOM OOF PREDICTION → SEGMENT & TÍNH SCORE()
# =========================================================

def build_oof_submission_from_parquet(
    target_lab_id: Optional[str] = None,
) -> pd.DataFrame:
    """
    Turn the stored OOF frame predictions into segment-level predictions.

    Every ``oof_predictions.parquet`` below RESULTS_DIR is read; lab and
    behavior are taken from the two directory names above the file. Frames are
    hard-labelled with a fixed 0.5 threshold, and consecutive positive rows of
    one (lab, video, agent, target, action) group are merged into
    ``[start_frame, stop_frame)`` segments.

    If ``target_lab_id`` is given, only that lab's files are used.

    Raises RuntimeError when no OOF file exists, or none matches the lab.
    """
    oof_files = list(RESULTS_DIR.glob("*/**/oof_predictions.parquet"))
    if not oof_files:
        raise RuntimeError("Không tìm thấy OOF parquet, hãy train trước.")

    group_cols = ["lab_id", "video_id", "agent_id", "target_id", "action"]

    per_file = []
    for path in oof_files:
        # Layout: <RESULTS_DIR>/<lab>/<behavior>/oof_predictions.parquet
        behavior = path.parent.name
        lab_id = path.parent.parent.name

        # Keep only the requested lab, if one was requested.
        if target_lab_id is not None and lab_id != target_lab_id:
            continue

        table = pd.read_parquet(path)[INDEX_COLS + ["prediction"]].copy()
        table["lab_id"] = lab_id
        table["action"] = behavior
        per_file.append(table)

    if not per_file:
        raise RuntimeError(
            f"Không có OOF predictions nào cho lab_id={target_lab_id}"
        )

    frame_df = (
        pd.concat(per_file, ignore_index=True)
        .sort_values(group_cols + ["video_frame"])
        .reset_index(drop=True)
    )

    segments = []
    for key, group in frame_df.groupby(group_cols, sort=False):
        lab_id, video_id, agent_id, target_id, action = key
        frames = group["video_frame"].values
        # Fixed demo threshold; the per-fold tuned thresholds are not applied here.
        is_positive = group["prediction"].values >= 0.5

        def close_segment(start_frame, stop_frame):
            # Ids are converted only when a segment is actually emitted.
            segments.append(
                {
                    "lab_id": lab_id,
                    "video_id": int(video_id),
                    "agent_id": int(agent_id),
                    "target_id": int(target_id),
                    "action": action,
                    "start_frame": start_frame,
                    "stop_frame": stop_frame,
                }
            )

        open_start = None  # start frame of the currently open segment, if any
        last_frame = None
        for frame, positive in zip(frames, is_positive):
            if positive:
                if open_start is None:
                    open_start = int(frame)
            elif open_start is not None:
                close_segment(open_start, int(last_frame + 1))  # [start, stop)
                open_start = None
            last_frame = frame

        # A segment still open at the end of the group stops after the last frame.
        if open_start is not None:
            close_segment(open_start, int(frames[-1] + 1))

    segment_cols = group_cols + ["start_frame", "stop_frame"]
    if not segments:
        return pd.DataFrame(columns=segment_cols)

    return (
        pd.DataFrame(segments)
        .sort_values(group_cols + ["start_frame"])
        .reset_index(drop=True)
    )

# Video ids to leave out of validation (none by default).
BAD_VIDEOS = []

def compute_validation_score(
    submission: pd.DataFrame,
    lab_id: Optional[str] = None,
) -> float:
    """
    Evaluate ``submission`` with the ``score()`` metric on the training annotations.

    The ground truth is assembled from the annotation of every video listed in
    ``train.csv`` (minus BAD_VIDEOS), each tagged with its ``lab_id``,
    ``video_id`` and ``behaviors_labeled``. When ``lab_id`` is given, both the
    ground truth and the submission are restricted to that lab.

    Returns the score, or 0.0 when no annotation could be loaded.
    """
    meta = pd.read_csv(INPUT_DIR / "train.csv")

    restrict_to_lab = lab_id is not None
    if restrict_to_lab:
        meta = meta[meta["lab_id"] == lab_id].reset_index(drop=True)

    if BAD_VIDEOS:
        meta = meta[~meta["video_id"].isin(BAD_VIDEOS)]

    # Collect the annotation table of every remaining video.
    per_video = []
    for _, meta_row in meta.iterrows():
        lab, vid = meta_row["lab_id"], meta_row["video_id"]
        ann = load_annotation(lab, vid)
        if ann.empty:
            continue
        ann["lab_id"] = lab
        ann["video_id"] = vid
        ann["behaviors_labeled"] = meta_row["behaviors_labeled"]
        per_video.append(ann)

    if not per_video:
        print("Không có annotation nào để validate!")
        return 0.0

    dataset = pd.concat(per_video, ignore_index=True)

    if restrict_to_lab:
        submission = submission[submission["lab_id"] == lab_id].reset_index(drop=True)

    s = score(dataset, submission, row_id_column_name="row_id")

    print(
        f"Official validation score"
        f"{' (lab=' + lab_id + ')' if lab_id is not None else ''}: {s:.6f}"
    )
    return float(s)



# =========================================================
# 8. MAIN
# =========================================================
def str_to_mouse_id(s: str) -> int:
    """Map a mouse label to an integer id: ``"self"`` -> -1, ``"mouse3"`` -> 3."""
    return -1 if s == "self" else int(str(s).replace("mouse", ""))


def predict_behaviors_for_pair(
    lab_id: str,
    video_id: int,
    agent_internal_id: Any,
    target_internal_id: Any,
    behaviors: List[str],
    test_meta: pd.DataFrame,
) -> pd.DataFrame:
    """
    Run inference for one (video, agent, target) pair over a list of behaviors.

    ``behaviors`` is expected to be of a single mode (all self or all pair).
    For every behavior with saved fold models, each fold contributes its
    probability on the frames where it reaches that fold's threshold (0
    elsewhere); the contributions are averaged over folds. Per frame, the
    behavior with the highest averaged score wins, or "none" when all scores
    are 0. Consecutive frames with the same winner are merged into segments.

    Returns a DataFrame with the columns video_id, action, start_frame and
    stop_frame (stop exclusive). The frame is empty, never ``None``, when the
    lab is not the one this cell serves, when no features could be built, or
    when no model produced a positive frame.
    """
    seg_columns = ["video_id", "action", "start_frame", "stop_frame"]

    # This cell only serves GroovyShrew. Callers test `.empty` on the result,
    # so other labs get an empty frame rather than None.
    if lab_id != "GroovyShrew":
        return pd.DataFrame(columns=seg_columns)

    frames, feat_df = get_frame_features_for_pair_test(
        lab_id=lab_id,
        video_id=video_id,
        agent_id=agent_internal_id,
        target_id=target_internal_id,
        test_meta=test_meta,
    )
    if feat_df.empty:
        return pd.DataFrame(columns=seg_columns)

    feat_df = feat_df.astype("float32")
    n_frames = len(feat_df)

    scores_per_behavior = {}
    for behavior in behaviors:
        models = load_models_for_behavior_infer(lab_id, behavior)
        if not models:
            continue

        # Build the test matrix with exactly the model's feature set; features
        # the model expects but the test data lacks stay at 0.0.
        req_feats = models[0][0].feature_names
        X_test = pd.DataFrame(
            0.0,
            index=feat_df.index,
            columns=req_feats,
            dtype=np.float32,
        )
        common = [name for name in req_feats if name in feat_df.columns]
        if common:
            X_test[common] = feat_df[common]

        dtest = xgb.DMatrix(X_test, feature_names=req_feats)

        agg_scores = np.zeros(n_frames, dtype=np.float32)
        for booster, thr in models:
            probs = booster.predict(dtest)
            fired = (probs >= thr).astype(np.int8)
            # A fold only contributes where it passes its own threshold.
            agg_scores += probs * fired

        agg_scores /= max(len(models), 1)
        scores_per_behavior[behavior] = agg_scores

        del dtest, X_test
        gc.collect()

    if not scores_per_behavior:
        return pd.DataFrame(columns=seg_columns)

    beh_list = list(scores_per_behavior.keys())
    score_mat = np.vstack([scores_per_behavior[b] for b in beh_list]).T  # [F, B]

    max_idx = score_mat.argmax(axis=1)
    max_scores = score_mat.max(axis=1)
    labels = np.where(max_scores == 0.0, "none", np.array(beh_list)[max_idx])

    # Frame-level labels -> segments.
    segments = []
    prev_lab = "none"
    prev_start = None
    prev_f = None

    for f, lab in zip(frames, labels):
        if lab != prev_lab:
            if prev_lab != "none":
                segments.append(
                    {
                        "video_id": int(video_id),
                        "action": prev_lab,
                        "start_frame": int(prev_start),
                        "stop_frame": int(prev_f + 1),
                    }
                )
            prev_lab = lab
            prev_start = f
        prev_f = f

    # Close a segment that is still open after the last frame.
    if prev_lab != "none":
        segments.append(
            {
                "video_id": int(video_id),
                "action": prev_lab,
                "start_frame": int(prev_start),
                "stop_frame": int(prev_f + 1),
            }
        )

    if not segments:
        return pd.DataFrame(columns=seg_columns)

    return pd.DataFrame(segments)



# Inference for one lab: predict every test video and write submission4.csv.
target_lab = "GroovyShrew"
print(f"Đọc test.csv cho lab {target_lab} ...")
test_meta = pd.read_csv(INPUT_DIR / "test.csv")
test_meta = test_meta[test_meta["lab_id"] == target_lab].reset_index(drop=True)

# Trained behaviors = names of the sub-directories of RESULTS_DIR/<target_lab>.
lab_result_dir = RESULTS_DIR / target_lab
if lab_result_dir.exists():
    trained_behaviors = sorted(
        [p.name for p in lab_result_dir.iterdir() if p.is_dir()]
    )
else:
    trained_behaviors = []

self_behaviors_in_lab = [b for b in trained_behaviors if b in SELF_BEHAVIORS]
pair_behaviors_in_lab = [b for b in trained_behaviors if b in PAIR_BEHAVIORS]

print("Behaviors (self) dùng để predict:", self_behaviors_in_lab)
print("Behaviors (pair) dùng để predict:", pair_behaviors_in_lab)


def to_submit_id(mid):
    """Render an internal mouse id as the "mouse<N>" string used in the submission."""
    s = str(mid)
    return s if s.startswith("mouse") else f"mouse{s}"


all_segments = []

# Loop over the test videos of the lab.
for video_id in sorted(test_meta["video_id"].unique()):
    print(f"Predict video_id={video_id} ...")

    tracking = load_tracking_test(target_lab, video_id)
    mouse_ids_internal = sorted(tracking["mouse_id"].unique().tolist())

    # SELF behaviors: the agent is also passed as the target.
    if self_behaviors_in_lab:
        for mid in mouse_ids_internal:
            seg_df = predict_behaviors_for_pair(
                lab_id=target_lab,
                video_id=video_id,
                agent_internal_id=mid,
                target_internal_id=mid,  # self
                behaviors=self_behaviors_in_lab,
                test_meta=test_meta,
            )
            # The predictor may return None; treat that like "no segments".
            if seg_df is not None and not seg_df.empty:
                seg_df["agent_id"] = to_submit_id(mid)
                seg_df["target_id"] = "self"
                all_segments.append(seg_df)

    # PAIR behaviors: every ordered pair with agent != target.
    if pair_behaviors_in_lab and len(mouse_ids_internal) > 1:
        for agent_internal, target_internal in itertools.permutations(
            mouse_ids_internal, 2
        ):
            seg_df = predict_behaviors_for_pair(
                lab_id=target_lab,
                video_id=video_id,
                agent_internal_id=agent_internal,
                target_internal_id=target_internal,
                behaviors=pair_behaviors_in_lab,
                test_meta=test_meta,
            )
            if seg_df is not None and not seg_df.empty:
                seg_df["agent_id"] = to_submit_id(agent_internal)
                seg_df["target_id"] = to_submit_id(target_internal)
                all_segments.append(seg_df)

# Merge all segments into the submission4.csv table.
submission_columns = [
    "video_id",
    "agent_id",
    "target_id",
    "action",
    "start_frame",
    "stop_frame",
]
if all_segments:
    submission4 = pd.concat(all_segments, ignore_index=True)
    submission4 = submission4[submission_columns]
    submission4 = submission4.sort_values(
        ["video_id", "agent_id", "target_id", "action", "start_frame"]
    ).reset_index(drop=True)
else:
    # Empty table with the right columns; no dummy row is added.
    submission4 = pd.DataFrame(columns=submission_columns)

# Add row_id (also when the table is empty).
submission4.insert(0, "row_id", np.arange(len(submission4), dtype=np.int64))

sub_path = WORKING_DIR / "submission4.csv"
submission4.to_csv(sub_path, index=False)
print(f"Saved GroovyShrew submission to {sub_path}")


Đọc test.csv cho lab GroovyShrew ...
Behaviors (self) dùng để predict: ['climb', 'dig', 'rear', 'rest', 'run', 'selfgroom']
Behaviors (pair) dùng để predict: ['approach', 'attemptmount', 'defend', 'escape', 'sniff', 'sniffgenital']
Saved GroovyShrew submission to /kaggle/working/submission4.csv


# JovialSwallow

In [8]:
from pathlib import Path
import shutil
import gc

WORKING_DIR = Path("/kaggle/working")

# 1) Remove everything directly under /kaggle/working except .csv files.
for path in WORKING_DIR.iterdir():
    if path.is_file():
        # Keep the .csv files.
        if path.suffix == ".csv":
            continue
        try:
            path.unlink()
        except Exception as e:
            print(f"Cannot remove file {path}: {e}")
    elif path.is_dir():
        try:
            shutil.rmtree(path, ignore_errors=True)
        except Exception as e:
            print(f"Cannot remove dir {path}: {e}")


gc.collect()

78

In [9]:
from __future__ import annotations
from typing import Dict, List, Tuple, Any, Optional
import warnings
from dataclasses import dataclass, field
import numpy as np
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from tqdm import tqdm

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
np.seterr(invalid="ignore", divide="ignore")

# =============================================================================
# 1. CONFIGURATION
# =============================================================================
@dataclass
class FeatureConfig:
    """
    Hyperparameters used by the feature extraction code.
    """
    fps: float = 30.0           # video frame rate in frames per second
    pix_per_cm: float = 1.0     # pixels per centimetre; pixel values are divided by it
    smooth_sigma: float = 1.0   # sigma of the Gaussian smoothing filter (None skips smoothing)
    use_pairwise: bool = True   # NOTE(review): presumably toggles pairwise features; not read in the visible code


# =============================================================================
# 2. AGENT CONTEXT
# =============================================================================
@dataclass
class AgentContext:
    """
    Container for the pre-processed data of one mouse.
    Keeps velocity/acceleration so they do not have to be recomputed repeatedly.
    """
    idx: pd.Index          # frame index
    pos: np.ndarray        # [F, 2] cm
    vel: np.ndarray        # [F, 2] cm/s
    speed: np.ndarray      # [F, 1] cm/s
    acc: np.ndarray        # [F, 2] cm/s^2
    
    cx: pd.Series          # X coordinate as a Series (for rolling windows)
    cy: pd.Series          # Y coordinate as a Series
    speed_series: pd.Series # speed as a Series
    
    raw_df: Optional[pd.DataFrame] = None # original per-body-part data


# =============================================================================
# 3. FEATURE EXTRACTOR
# =============================================================================
class FeatureExtractor:
    """
    Class trích xuất đặc trưng hành vi từ dữ liệu tracking.
    """
    def __init__(self, fps: float, pix_per_cm: float, smooth_sigma: float = 1.0, use_pairwise: bool = True):
        """Store the extraction settings and register the feature modules.

        ``fps`` and ``pix_per_cm`` are coerced to ``float``; ``smooth_sigma``
        and ``use_pairwise`` are stored as given.
        """
        self.cfg = FeatureConfig(
            fps=float(fps),
            pix_per_cm=float(pix_per_cm),
            smooth_sigma=smooth_sigma,
            use_pairwise=use_pairwise,
        )

        # Registered feature modules as (name, bound method) pairs.
        # The "pose" and "pairwise" modules are currently not registered.
        modules = [
            ("kinematics", self._feat_basic_kinematics),
            ("multiscale", self._feat_multiscale),
            ("long_range", self._feat_long_range),
            ("cumulative", self._feat_cumulative),
            ("curvature", self._feat_curvature),
            ("speed_asym", self._feat_speed_asym),
            ("gauss_shift", self._feat_gauss_shift),
            ("avoid", self._feat_attack_sniff),
            ("a", self._feat_follow_pattern),
            ("b", self._feat_shortburst_social),
        ]
        self.feature_registry = dict(modules)

    # --- Helpers ---
    def _scale(self, n_frames_30fps: int) -> int:
        """Quy đổi số frame từ chuẩn 30fps sang fps thực tế của video."""
        return max(1, int(round(n_frames_30fps * self.cfg.fps / 30.0)))

    def _to_cm(self, arr):
        """Chuyển pixel -> cm."""
        return arr / self.cfg.pix_per_cm

    def _smooth(self, x):
        """Làm mượt dữ liệu bằng Gaussian filter."""
        if self.cfg.smooth_sigma is None or x.shape[0] < 3: return x
        if np.all(np.isnan(x)): return x
        return gaussian_filter1d(x, sigma=self.cfg.smooth_sigma, axis=0, mode="nearest")

    def _forward_fill_nan(self, pos):
        """
        Điền dữ liệu thiếu (NaN) bằng giá trị hợp lệ trước đó (Forward Fill).
        """
        if np.all(np.isnan(pos)):
            return np.zeros_like(pos)

        pos_ffill = pos.copy()
        mask = np.any(~np.isnan(pos_ffill), axis=1)
        if not mask.any():
            return np.zeros_like(pos_ffill)

        valid_idx = np.where(mask)[0]
        first, last = valid_idx[0], valid_idx[-1]
        pos_ffill[:first] = pos_ffill[first]
        pos_ffill[last + 1:] = pos_ffill[last]
        df_temp = pd.DataFrame(pos_ffill)
        df_temp = df_temp.ffill()
        return df_temp.to_numpy()
    
    def _speed_series(self, cx: pd.Series, cy: pd.Series) -> pd.Series:
        dx = cx.diff()
        dy = cy.diff()
        v = np.hypot(dx, dy).fillna(0.0) * self.cfg.fps
        return v.astype("float32")
    
    def _roll_future_mean(self, s: pd.Series, w: int, min_p: int = 1) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).mean().iloc[::-1]

    def _roll_future_var(self, s: pd.Series, w: int, min_p: int = 2) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).var().iloc[::-1]

    # --- Core Logic ---
    def _compute_kinematics(self, pos_px: np.ndarray):
        """
        Tính toán vật lý cơ bản: Pos(cm), Vel, Speed, Acc.
        Input: Array [Frames, 2] (pixel).
        Output: Tuple (pos_cm, vel, speed, acc).
        """
        pos_ffill = self._forward_fill_nan(pos_px)
        pos_cm = self._to_cm(pos_ffill.astype(np.float32))
        pos_cm = self._smooth(pos_cm)                                               # [F, 2]

        dt = 1.0 / self.cfg.fps
        vel = np.zeros_like(pos_cm, dtype=np.float32)
        vel[1:] = (pos_cm[1:] - pos_cm[:-1]) / dt                                   # [F, 2: (vx, vy)]
        speed = np.linalg.norm(vel, axis=1, keepdims=True).astype(np.float32)       # [F, 1]

        acc = np.zeros_like(pos_cm, dtype=np.float32)                          
        acc[1:] = (vel[1:] - vel[:-1]) / dt                                         # [F, 2:(ax, ay)]
        return pos_cm.astype(np.float32), vel, speed, acc

    def _build_context(self, frames, pos_px, mouse_df=None) -> AgentContext:
        """
        Build the AgentContext of one mouse from its frame numbers and pixel positions.

        The kinematics are computed once; x, y and speed are additionally
        wrapped as Series indexed by frame. ``mouse_df`` is stored as ``raw_df``.
        """
        pos, vel, speed, acc = self._compute_kinematics(pos_px)
        frame_index = pd.Index(frames, name="frame")

        def on_frames(values):
            return pd.Series(values, index=frame_index)

        return AgentContext(
            idx=frame_index,
            pos=pos,
            vel=vel,
            speed=speed,
            acc=acc,
            cx=on_frames(pos[:, 0]),
            cy=on_frames(pos[:, 1]),
            speed_series=on_frames(speed[:, 0]),
            raw_df=mouse_df,
        )

    # --- Feature Modules ---
    def _feat_basic_kinematics(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Lấy các giá trị thô: tọa độ x, y, vận tốc vx, vy, tốc độ, gia tốc ax, ay.
        """
        return {
            "a_x": ctx.pos[:, 0], "a_y": ctx.pos[:, 1],
            "a_vx": ctx.vel[:, 0], "a_vy": ctx.vel[:, 1],
            "a_speed": ctx.speed[:, 0],
            "a_ax": ctx.acc[:, 0], "a_ay": ctx.acc[:, 1]
        }

    def _feat_multiscale(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tính tốc độ trung bình (Mean) và độ lệch chuẩn (Std) ở đa mức thời gian.
        Feature 'sp_ratio' đo độ bùng nổ (Burstiness).
        """
        feats = {}
        speed = ctx.speed_series
        frame_scales = [10, 40, 160]
        for scale in frame_scales:
            ws = self._scale(scale)
            if len(speed) >= ws:
                roller = speed.rolling(ws, min_periods=max(1, ws//4), center=True)
                feats[f"sp_m{scale}"] = roller.mean().astype("float32")
                feats[f"sp_s{scale}"] = roller.std().astype("float32")
        feats[f"sp_ratio"] = feats["sp_m10"] / (feats["sp_m160"] + 1e-6)
        return feats 

    
        
    def _feat_long_range(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Đặc trưng ngữ cảnh dài hạn:
        - x_ml, y_ml: Vị trí trung bình trong quá khứ.
        - sp_pct: Xếp hạng (percentile) của tốc độ hiện tại so với quá khứ.
        """
        feats: Dict[str, pd.Series] = {}
        speed = ctx.speed_series

        for window in [120, 240]:
            ws = self._scale(window)
            if len(ctx.cx) >= ws:
                feats[f"x_ml{window}"] = ctx.cx.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()
                feats[f"y_ml{window}"] = ctx.cy.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()

        for span in [60, 120]:
            s = self._scale(span)
            feats[f"x_e{span}"] = ctx.cx.ewm(span=s, min_periods=1).mean()
            feats[f"y_e{span}"] = ctx.cy.ewm(span=s, min_periods=1).mean()

        for window in [60, 120]:
            ws = self._scale(window)
            if len(speed) >= ws:
                feats[f"sp_pct{window}"] = speed.rolling(
                    ws, min_periods=max(5, ws // 6), center=True
                ).rank(pct=True)
        return feats
    

    def _feat_curvature(self, ctx: AgentContext, **kwargs) -> Dict:
        feats = {}

        vel_x, vel_y = ctx.vel[:, 0], ctx.vel[:, 1]
        acc_x, acc_y = ctx.acc[:, 0], ctx.acc[:, 1]
        cross_prod = vel_x * acc_y - vel_y * acc_x
        vel_mag = np.sqrt(vel_x**2 + vel_y**2)
        moving_mask = vel_mag > 2.0
        vel_mag_safe = np.maximum(vel_mag, 0.1 / self.cfg.fps)
        raw_curv = cross_prod / (vel_mag_safe**3)
        raw_curv = np.where(moving_mask, raw_curv, 0.0)
        min_turn_radius_cm = 0.5
        max_k = 1.0 / min_turn_radius_cm
        raw_curv = np.clip(raw_curv, -max_k, max_k)
        abs_curv = np.abs(raw_curv)
        abs_curv_series = pd.Series(abs_curv, index=ctx.idx)

        for w in [30, 60]:
            ws = self._scale(w)
            min_p = max(ws // 3, 1)
            feats[f"curv_mean_{w}"] = abs_curv_series.rolling(ws, min_periods=min_p).mean()

        angle = np.arctan2(vel_y, vel_x)
        angle_series = pd.Series(angle, index=ctx.idx)
        angle_change = np.abs(angle_series.diff().fillna(0.0))
        angle_change = np.where(angle_change > np.pi, 2 * np.pi - angle_change, angle_change)
        angle_change_series = pd.Series(angle_change, index=ctx.idx)
        angle_change_series = pd.Series(np.where(moving_mask, angle_change_series, 0.0), index=ctx.idx)

        ws = self._scale(30)
        feats["turn_rate_30"] = angle_change_series.rolling(ws, min_periods=max(ws // 3, 1)).sum()

        return feats
    
    def _feat_cumulative(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tổng quãng đường di chuyển trong một khoảng thời gian dài xung quanh frame hiện tại.
        """
        feats = {}
        L = max(1, self._scale(180))
        step = np.hypot(ctx.cx.diff(), ctx.cy.diff()).fillna(0.0)
        path = step.rolling(2 * L + 1, min_periods=max(5, L // 6), center=True).sum()
        feats["path_cum180"] =  path.fillna(0.0).astype("float32")
        return feats

    def _feat_speed_asym(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Bất đối xứng tốc độ (Tương lai - Quá khứ).
        """
        w = max(3, self._scale(30))
        v = ctx.speed_series
        v_past = v.rolling(w, min_periods=1).mean()
        v_fut = self._roll_future_mean(v, w, min_p=1)
        return {"spd_asym_1s": (v_fut - v_past).fillna(0.0)}
    
    def _feat_gauss_shift(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Độ lệch Gaussian (KL Divergence) giữa quá khứ và tương lai.
        Đo lường sự thay đổi trạng thái thống kê.
        """
        w = max(5, self._scale(30))
        v = ctx.speed_series
        mu_p = v.rolling(w, min_periods=1).mean()
        va_p = v.rolling(w, min_periods=1).var().clip(lower=1e-6)
        mu_f = self._roll_future_mean(v, w, min_p=1)
        va_f = self._roll_future_var(v, w, min_p=1).clip(lower=1e-6)

        kl_pf = 0.5 * (
            (va_p / va_f) + ((mu_f - mu_p) ** 2) / va_f - 1.0 + np.log(va_f / va_p)
        )
        kl_fp = 0.5 * (
            (va_f / va_p) + ((mu_p - mu_f) ** 2) / va_p - 1.0 + np.log(va_p / va_f)
        )
        return {
            "spd_symkl_1s": (kl_pf + kl_fp).replace([np.inf, -np.inf], np.nan).fillna(0.0)
        }

    def _feat_avoidance_trajectory(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Tính toán quỹ đạo né tránh:
        1. Relative Heading: Góc di chuyển so với hướng tới đối thủ.
        2. Future Distance Gain: Dự báo xem hành động này có giúp chuột ra xa đối thủ trong tương lai không.
        """
        feats = {}
        if target_ctx is None: 
            return feats

        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")
        rel_vec = target_ctx.pos - ctx.pos
        # Góc hướng tới địch (Angle to Target)
        angle_to_target = np.arctan2(rel_vec[:, 1], rel_vec[:, 0])
        
        # Góc di chuyển của Tôi (My Heading)
        my_heading = np.arctan2(ctx.vel[:, 1], ctx.vel[:, 0])
        
        # Độ lệch góc (Absolute Difference)
        # Cần xử lý wrap góc (ví dụ: lệch giữa 179 độ và -179 độ là 2 độ chứ ko phải 358)
        diff = np.abs(angle_to_target - my_heading)
        diff = np.minimum(diff, 2*np.pi - diff) # Chuẩn hóa về [0, pi]
        
        # Feature: Cosine của góc lệch
        # 1.0 (0 độ) -> Lao vào
        # 0.0 (90 độ) -> AVOID (Lách ngang)
        # -1.0 (180 độ) -> Escape
        feats["heading_rel_cos"] = pd.Series(np.cos(diff), index=idx, dtype="float32")
        
        # Feature: Góc lệch tuyệt đối (đổi ra độ cho dễ hình dung nếu cần, ở đây để rad)
        feats["heading_rel_abs"] = pd.Series(diff, index=idx, dtype="float32")


        # --- 2. FUTURE DISTANCE GAIN (Hiệu quả tránh né) ---
        # "Sau 15 frame (0.5s) hoặc 30 frame (1s), mình có xa nó ra không?"
        
        dist_now = np.linalg.norm(rel_vec, axis=1)
        s_dist = pd.Series(dist_now, index=idx)
        
        scales = [15, 30] # 0.5s và 1s
        for w in scales:
            ws = self._scale(w)
            
            # Lấy khoảng cách ở tương lai (shift ngược lên)
            # s.shift(-ws) là giá trị của t + ws
            dist_future = s_dist.shift(-ws)
            gain = dist_future - s_dist
            
            feats[f"dist_gain_{w}f"] = gain.fillna(0.0).astype("float32")

        return feats
    
    def _extract_part(self, ctx: AgentContext, part: str) -> Optional[np.ndarray]:
        if ctx.raw_df is None: return None
        if part not in ctx.raw_df.columns.get_level_values(0): return None
        try:
            sub_df = ctx.raw_df.xs(part, axis=1, level=0)[["x", "y"]].reindex(ctx.idx)
        except KeyError: return None
        raw = sub_df.to_numpy()
        raw = self._forward_fill_nan(raw)
        cm = self._to_cm(raw.astype(np.float32))
        return self._smooth(cm)
    
    def _extract_parts_dict(self, ctx: AgentContext, parts: List[str] = None) -> Dict[str, Optional[np.ndarray]]:
        out = {}
        for p in parts:
            out[p] = self._extract_part(ctx, p)
        return out
        
    def _feat_pose_shape(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Placeholder cho các đặc trưng hình dáng (Elongation, Body Angle...).
        """
        feats = {}

        def zero(): return pd.Series(0.0, index=ctx.idx, dtype="float32")

        def dist(k1, k2):
            p1, p2 = parts.get(k1), parts.get(k2)
            if p1 is None or p2 is None: return zero()
            d = np.linalg.norm(p1 - p2, axis=1)
            return pd.Series(d, index=ctx.idx, dtype="float32")
        
        # def body_angle():
        #     if parts.get("nose") is None: return zero()
        #     if parts.get("neck") is None: return zero()
        #     if parts.get("tail_base") is None: return zero()

        #     v1 = parts.get("nose") - parts.get("neck")
        #     v2 = parts.get("tail_base") - parts.get("neck")
        #     dot_product = np.sum(v1 * v2, axis=1)
        #     mag = np.linalg.norm(v1, axis=1) * np.linalg.norm(v2, axis=1)
        #     cos_angle = np.clip(dot_product / (mag + 1e-6), -1.0, 1.0).astype("float32")
        #     return cos_angle
        
        # def elongation():
        #     if parts.get("nose")          is None: return zero()
        #     if parts.get("tail_base")     is None: return zero()
        #     if parts.get("lateral_left")  is None: return zero()
        #     if parts.get("lateral_right") is None: return zero()

        #     d1 = dist("nose", "tail_base")
        #     d2 = dist("lateral_left", "lateral_right")
        #     elongation = d1 / (d2 + 1e-6).astype("float32")
        #     return elongation

        
        
        def vel(part: str, n_frames_30fps: int) -> Dict:
            part_pos = self._extract_part(ctx, part)
            if part_pos is None: return zero()
            
            s_x = pd.Series(part_pos[:, 0], index=ctx.idx)
            s_y = pd.Series(part_pos[:, 1], index=ctx.idx)
            raw_speed = self._speed_series(s_x, s_y)

            ws = self._scale(n_frames_30fps)
            val = raw_speed.rolling(ws, min_periods=1, center=True).mean()
            return val.astype("float32")


        target_parts = ["nose", "tail_base", 
                        "ear_left", "ear_right", "neck", "hip_left", "hip_right"]
        
        parts = self._extract_parts_dict(ctx, target_parts)

        feats["a_body_width"]                = dist("hip_left", "hip_right")
        # feats["aa_nose_bodycenter_dist"]     = dist("nose", "body_center")
        # feats["aa_nose_tailbase_dist"]       = dist("nose", "tail_base")
        # feats["aa_bodycenter_tailbase_dist"] = dist("body_center", "tail_base")
        
        # feats["aa_bodycenter_ear_l_dist"]    = dist("body_center", "ear_left")
        # feats["aa_bodycenter_ear_r_dist"]    = dist("body_center", "ear_right")
        # feats["aa_bodycenter_lateral_l_dist"]= dist("body_center", "lateral_left")
        # feats["aa_bodycenter_lateral_r_dist"]= dist("body_center", "lateral_right")
        
        # feats["a_body_angle"]                = body_angle()
        # feats["a_elongation"]                = elongation()
        # feats["a_tail_base_vel_500ms"]       = vel("tail_base", 15)
        # feats["a_tail_base_vel_1000ms"]      = vel("tail_base", 30)
        # feats["a_tail_base_vel_2000ms"]      = vel("tail_base", 60)
        # feats["a_tail_base_vel_3000ms"]      = vel("tail_base", 90)
        feats["a_nose_vel_500ms"]            = vel("nose", 15)
        feats["a_nose_vel_1000ms"]           = vel("nose", 30)
        feats["a_nose_vel_2000ms"]           = vel("nose", 60)
        feats["a_nose_vel_3000ms"]           = vel("nose", 90)
        # feats["a_ear_right_vel_500ms"]       = vel("ear_right", 15)
        # feats["a_ear_right_vel_1000ms"]      = vel("ear_right", 30)
        # feats["a_ear_right_vel_2000ms"]      = vel("ear_right", 60)
        # feats["a_ear_right_vel_3000ms"]      = vel("ear_right", 90)

        return feats

    def _feat_shortburst_social(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Short-burst social features (10–30 frames) đặc biệt cho attack / chase / escape.
        Chỉ dùng được khi có target_ctx.
        """
        feats = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")
    
        # --- Lấy lại vài quantity cơ bản từ pairwise/avoidance ---
        # vector Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        rel_dist = np.linalg.norm(rel_vec, axis=1)
        rel_dist_s = pd.Series(rel_dist, index=idx, dtype="float32")
    
        # unit vector
        rel_dist_safe = np.where(rel_dist == 0, 1e-6, rel_dist)
        u_vec = rel_vec / rel_dist_safe[:, None]
    
        # velocity dọc trục nối (approach speed)
        a_vel = ctx.vel
        t_vel = target_ctx.vel
        a_along = np.sum(a_vel * u_vec, axis=1)                # +: lao vào target
        t_along = np.sum(t_vel * (-u_vec), axis=1)             # +: target lao vào agent
        rel_along = np.sum((a_vel - t_vel) * u_vec, axis=1)    # +: lại gần nhau
    
        a_along_s = pd.Series(a_along, index=idx, dtype="float32")
        t_along_s = pd.Series(t_along, index=idx, dtype="float32")
        rel_along_s = pd.Series(rel_along, index=idx, dtype="float32")
    
        # speed agent / target
        a_speed = ctx.speed_series
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32"
        )
    
        # heading_rel_cos ~ escape / approach
        # vector body của agent
        # (reuse idea từ _feat_pairwise)
        # head ~ nose, tail ~ tail_base/body_center
        parts_a = self._extract_parts_dict(ctx, ["nose", "tail_base", "body_center"])
        head_a = parts_a.get("nose")
        tail_a = parts_a.get("tail_base") if parts_a.get("tail_base") is not None else parts_a.get("body_center")
    
        if head_a is not None and tail_a is not None:
            body_vec_a = head_a - tail_a
            dot = np.sum(body_vec_a * rel_vec, axis=1)
            mag = np.linalg.norm(body_vec_a, axis=1) * rel_dist_safe
            heading_cos = np.clip(dot / (mag + 1e-6), -1.0, 1.0)
            heading_cos_s = pd.Series(heading_cos, index=idx, dtype="float32")
        else:
            heading_cos_s = zero()
    
        # --- Rolling window 10, 20, 30 frames (ở fps gốc) ---
        for w30 in [10, 20, 30]:
            ws = self._scale(w30)
            min_p = max(1, ws // 3)
    
            # Attack-like: approach mạnh, khoảng cách giảm nhanh
            feats[f"sb_att_approach_mean_{w30}"] = a_along_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_att_rel_along_mean_{w30}"] = rel_along_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_att_dist_delta_{w30}"] = (rel_dist_s - rel_dist_s.shift(ws)).fillna(0.0)
    
            # Chase-like: agent & target đều nhanh, dist tương đối nhỏ
            feats[f"sb_chase_speed_agent_mean_{w30}"] = a_speed.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_chase_speed_target_mean_{w30}"] = t_speed.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_chase_dist_mean_{w30}"] = rel_dist_s.rolling(ws, min_periods=min_p).mean()
    
            # Escape-like: heading ngược, dist tăng nhanh
            feats[f"sb_esc_heading_cos_mean_{w30}"] = heading_cos_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_esc_dist_gain_{w30}"] = (rel_dist_s.shift(-ws) - rel_dist_s).fillna(0.0)
    
        # clip & fillna
        for k, v in feats.items():
            feats[k] = v.replace([np.inf, -np.inf], np.nan).fillna(0.0).astype("float32")
    
        return feats


    def _feat_follow_pattern(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Đặc trưng hành vi FOLLOW:
          - Agent ở gần target
          - Cùng hướng (body + velocity)
          - Tốc độ vừa phải
          - Khoảng cách tương đối ổn định trong 0.5–1s
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")
    
        # --- 1. CÁC ĐẠI LƯỢNG CƠ BẢN ---
        # Vector Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        rel_dist = np.linalg.norm(rel_vec, axis=1)
        rel_dist_s = pd.Series(rel_dist, index=idx, dtype="float32")
    
        # Speed agent/target
        a_speed = ctx.speed_series.astype("float32")
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32",
        )
    
        # Body vector: nose - tail/body_center
        parts_a = self._extract_parts_dict(ctx, ["nose", "tail_base", "neck"])
        parts_t = self._extract_parts_dict(target_ctx, ["nose", "tail_base", "neck"])
    
        def body_vec(parts_dict):
            head = parts_dict.get("nose")
            tail = parts_dict.get("tail_base")
            if tail is None:
                tail = parts_dict.get("neck")
            if head is None or tail is None:
                return None
            return head - tail
    
        a_body = body_vec(parts_a)
        t_body = body_vec(parts_t)
    
        if a_body is not None and t_body is not None:
            dot_bt = np.sum(a_body * t_body, axis=1)
            mag_bt = np.linalg.norm(a_body, axis=1) * np.linalg.norm(t_body, axis=1)
            cos_body = np.clip(dot_bt / (mag_bt + 1e-6), -1.0, 1.0)
            cos_body_s = pd.Series(cos_body, index=idx, dtype="float32")
        else:
            cos_body_s = zero()
    
        # Velocity hướng
        a_vel = ctx.vel
        t_vel = target_ctx.vel
        a_speed_np = np.linalg.norm(a_vel, axis=1)
        t_speed_np = np.linalg.norm(t_vel, axis=1)
        moving_mask = (a_speed_np > 1e-3) & (t_speed_np > 1e-3)
    
        # cos giữa hướng velocity 2 con
        dot_v = np.sum(a_vel * t_vel, axis=1)
        mag_v = a_speed_np * t_speed_np + 1e-6
        cos_vel = np.zeros_like(dot_v, dtype="float32")
        cos_vel[moving_mask] = np.clip(dot_v[moving_mask] / mag_v[moving_mask], -1.0, 1.0)
        cos_vel_s = pd.Series(cos_vel, index=idx, dtype="float32")
    
        # --- 2. WINDOW NGẮN (FOLLOW LÀ PATTERN DÀI HƠN ATTACK) ---
        for w30 in [15, 30, 60]:   # ~0.5s, 1s, 2s
            ws = self._scale(w30)
            min_p = max(ws // 3, 1)
    
            # Khoảng cách trung bình & độ dao động
            m_dist = rel_dist_s.rolling(ws, min_periods=min_p).mean()
            s_dist = rel_dist_s.rolling(ws, min_periods=min_p).std()
    
            # Cùng hướng (body + velocity)
            m_cos_body = cos_body_s.rolling(ws, min_periods=min_p).mean()
            m_cos_vel  = cos_vel_s.rolling(ws, min_periods=min_p).mean()
    
            # Tốc độ vừa phải
            m_sp_a = a_speed.rolling(ws, min_periods=min_p).mean()
            m_sp_t = t_speed.rolling(ws, min_periods=min_p).mean()
    
            feats[f"follow_dist_mean_{w30}"] = m_dist
            feats[f"follow_dist_std_{w30}"]  = s_dist
            feats[f"follow_cos_body_mean_{w30}"] = m_cos_body
            feats[f"follow_cos_vel_mean_{w30}"]  = m_cos_vel
            feats[f"follow_speed_agent_mean_{w30}"] = m_sp_a
            feats[f"follow_speed_target_mean_{w30}"] = m_sp_t
    
        # Clean
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )
    
        return feats


    def _feat_attack_sniff(
        self,
        ctx: AgentContext,
        target_ctx: AgentContext = None,
        **kwargs
    ) -> Dict[str, pd.Series]:
        """
        Đặc trưng phân biệt attack vs sniff cho lab 2-mouse (agent=1, target=2).
    
        Ý tưởng:
          - attack: speed 2 con biến động mạnh, đổi hướng nhiều, body overlap cao.
          - sniff : mũi gần cổ/thân, overlap thấp hơn, motion nhẹ/ổn định hơn.
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
    
        def zero():
            return pd.Series(0.0, index=idx, dtype="float32")
    
        # ---------------------------------------------------------
        # 1) TRÍCH XUẤT CÁC BỘ PHẬN CẦN THIẾT
        # ---------------------------------------------------------
        parts_a = self._extract_parts_dict(
            ctx,
            ["nose", "neck", "ear_left", "ear_right", "hip_left", "hip_right", "tail_base"],
        )
        parts_t = self._extract_parts_dict(
            target_ctx,
            ["nose", "neck", "ear_left", "ear_right", "hip_left", "hip_right", "tail_base"],
        )

        # helper khoảng cách
        def dist(p1, p2):
            if p1 is None or p2 is None:
                return zero()
            d = np.linalg.norm(p1 - p2, axis=1)
            return pd.Series(d, index=idx, dtype="float32")
    
        # ---------------------------------------------------------
        # 2) ĐIỂM ĐẠI DIỆN THÂN (BODY CENTER) CHO MỖI CON
        #    dùng trung bình neck – hips – tail_base
        # ---------------------------------------------------------
        def body_center(parts: Dict[str, Optional[np.ndarray]]):
            arrs = []
            for key in ["neck", "hip_left", "hip_right", "tail_base"]:
                if parts.get(key) is not None:
                    arrs.append(parts[key])
            if not arrs:
                return None
            stack = np.stack(arrs, axis=1)  # [F, K, 2]
            # trung bình theo bộ phận
            bc = np.nanmean(stack, axis=1)
            return bc.astype("float32")
    
        a_center = body_center(parts_a)
        t_center = body_center(parts_t)
    
        if a_center is not None and t_center is not None:
            rel_vec = t_center - a_center
            rel_dist = np.linalg.norm(rel_vec, axis=1)
            feats["as_rel_body_dist"] = pd.Series(rel_dist, index=idx, dtype="float32")
        else:
            feats["as_rel_body_dist"] = zero()
    
        # ---------------------------------------------------------
        # 3) KHOẢNG CÁCH MŨI → PHẦN THÂN TARGET (CHO SNIFF)
        # ---------------------------------------------------------
        # mũi agent tới cổ/hips/tail_base target
        a_nose = parts_a.get("nose")
        t_neck = parts_t.get("neck")
        t_hip_l = parts_t.get("hip_left")
        t_hip_r = parts_t.get("hip_right")
        t_tail  = parts_t.get("tail_base")
    
        feats["as_dist_nose_neck"]   = dist(a_nose, t_neck)
        feats["as_dist_nose_hip_l"]  = dist(a_nose, t_hip_l)
        feats["as_dist_nose_hip_r"]  = dist(a_nose, t_hip_r)
        feats["as_dist_nose_tail"]   = dist(a_nose, t_tail)
    
        # khoảng cách mũi → "trung tâm thân" target
        t_body_c = body_center(parts_t)
        if a_nose is not None and t_body_c is not None:
            feats["as_dist_nose_bodycenter"] = dist(a_nose, t_body_c)
        else:
            feats["as_dist_nose_bodycenter"] = zero()
    
        # ---------------------------------------------------------
        # 4) MỨC ĐỘ “BẠO LỰC”: DAO ĐỘNG TỐC ĐỘ & ĐỔI HƯỚNG
        # ---------------------------------------------------------
        # speed 2 con từ velocity
        a_speed = pd.Series(
            np.linalg.norm(ctx.vel, axis=1),
            index=idx,
            dtype="float32",
        )
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32",
        )

        ws_05 = self._scale(15)  # ~0.5s
        mp_05 = max(ws_05 // 3, 1)
    
        feats["as_a_speed_std_05"] = (
            a_speed.rolling(ws_05, min_periods=mp_05).std().fillna(0.0).astype("float32")
        )
        feats["as_t_speed_std_05"] = (
            t_speed.rolling(ws_05, min_periods=mp_05).std().fillna(0.0).astype("float32")
        )
        feats["as_speed_std_sum_05"] = (
            feats["as_a_speed_std_05"] + feats["as_t_speed_std_05"]
        )
    
        # Đổi hướng (jerk góc) của agent
        a_angle = np.arctan2(ctx.vel[:, 1], ctx.vel[:, 0])
        a_angle_diff = np.abs(np.diff(a_angle))
        a_angle_diff = np.where(
            a_angle_diff > np.pi, 2 * np.pi - a_angle_diff, a_angle_diff
        )
        a_angle_diff = np.concatenate([[0.0], a_angle_diff])
        a_angle_diff_s = pd.Series(a_angle_diff, index=idx, dtype="float32")
    
        feats["as_a_turn_jerk_05"] = (
            a_angle_diff_s.rolling(ws_05, min_periods=mp_05)
            .sum()
            .fillna(0.0)
            .astype("float32")
        )
    
        # ---------------------------------------------------------
        # 5) XẤP XỈ OVERLAP CƠ THỂ (BODY OVERLAP)
        #    dùng bbox từ các bộ phận thân
        # ---------------------------------------------------------
        def build_bbox(parts: Dict[str, Optional[np.ndarray]]):
            # dùng neck, hips, tail_base; nếu thiếu sẽ bỏ qua
            arrs = []
            for k in ["neck", "hip_left", "hip_right", "tail_base"]:
                if parts.get(k) is not None:
                    arrs.append(parts[k])
            if not arrs:
                return None
            stack = np.stack(arrs, axis=1)  # [F, K, 2]
            xs = stack[:, :, 0]
            ys = stack[:, :, 1]
            xmin = np.nanmin(xs, axis=1)
            xmax = np.nanmax(xs, axis=1)
            ymin = np.nanmin(ys, axis=1)
            ymax = np.nanmax(ys, axis=1)
            return np.stack([xmin, ymin, xmax, ymax], axis=1).astype("float32")
    
        def iou_box(box1: np.ndarray, box2: np.ndarray):
            # box: [F, 4] = (xmin, ymin, xmax, ymax)
            x1 = np.maximum(box1[:, 0], box2[:, 0])
            y1 = np.maximum(box1[:, 1], box2[:, 1])
            x2 = np.minimum(box1[:, 2], box2[:, 2])
            y2 = np.minimum(box1[:, 3], box2[:, 3])
    
            inter_w = np.clip(x2 - x1, 0.0, None)
            inter_h = np.clip(y2 - y1, 0.0, None)
            inter = inter_w * inter_h
    
            area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])
            area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
            union = area1 + area2 - inter + 1e-6
            iou = inter / union
            return iou.astype("float32")

        bbox_a = build_bbox(parts_a)
        bbox_t = build_bbox(parts_t)
        if bbox_a is not None and bbox_t is not None:
            iou = iou_box(bbox_a, bbox_t)
            iou_s = pd.Series(iou, index=idx, dtype="float32")
    
            feats["as_body_iou"] = iou_s
    
            ws_1s = self._scale(30)
            mp_1s = max(ws_1s // 3, 1)
            feats["as_body_iou_mean_1s"] = (
                iou_s.rolling(ws_1s, min_periods=mp_1s).mean().fillna(0.0).astype("float32")
            )
        else:
            feats["as_body_iou"] = zero()
            feats["as_body_iou_mean_1s"] = zero()
    
        # ---------------------------------------------------------
        # 6) DỌN NẠN NaN / Inf
        # ---------------------------------------------------------
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )
    
        return feats


   

        

    def _feat_pairwise(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict:
        """
        Đặc trưng tương tác cặp đôi (Pairwise): Khoảng cách, Tốc độ tiếp cận.
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None: 
            return feats

        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")

        # --- 1. KHOẢNG CÁCH CƠ BẢN (DISTANCES) ---
        # Vector nối Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        dist = np.linalg.norm(rel_vec, axis=1)
        feats["rel_dist"] = pd.Series(dist, index=idx, dtype="float32")

        # --- 2. KHOẢNG CÁCH CHI TIẾT (NOSE-TO-PART) ---
        # Lấy các bộ phận quan trọng
        my_parts = self._extract_parts_dict(ctx, ["nose"])
        target_parts = self._extract_parts_dict(target_ctx, 
            ["nose", "tail_base", "ear_left", "ear_right", "neck", "hip_left", "hip_right"])

        def dist_ab(pt_a, pt_b):
            if pt_a is None or pt_b is None: return zero()
            d = np.linalg.norm(pt_a - pt_b, axis=1)
            return pd.Series(d, index=idx, dtype="float32")

        an, tn = my_parts["nose"], target_parts["nose"]
        feats["dist_nose_nose"] = dist_ab(an, tn)
        feats["dist_nose_tail"] = dist_ab(an, target_parts["tail_base"])
        # feats["dist_nose_body"] = dist_ab(an, target_parts["body_center"])
        feats["dist_nose_el"]   = dist_ab(an, target_parts["ear_left"])
        feats["dist_nose_er"]   = dist_ab(an, target_parts["ear_right"])

        # feats["dist_nose_hl"]   = dist_ab(an, target_parts["hip_left"])
        # feats["dist_nose_hr"]   = dist_ab(an, target_parts["hip_right"])
        # feats["dist_nose_ne"]   = dist_ab(an, target_parts["neck"])
        
        # feats["dist_nose_tll"]  = dist_ab(an, target_parts["lateral_left"])
        # feats["dist_nose_tlr"]  = dist_ab(an, target_parts["lateral_right"])
        # feats["dist_nose_tt"]  = dist_ab(an, target_parts["tail_tip"])

        # --- 3. ĐỊNH HƯỚNG & GÓC NHÌN (ORIENTATION & GAZE) ---
        # Helper lấy vector cơ thể (Mũi - Đuôi/Thân)
        def get_body_vec(parts_dict):
            head = parts_dict.get("nose")
            # Ưu tiên đuôi, nếu ko có thì dùng thân
            tail = parts_dict.get("tail_base")
            if tail is None: tail = parts_dict.get("neck") # Fallback
            
            if head is not None and tail is not None:
                return head - tail
            return None

        a_vec = get_body_vec(my_parts)
        t_vec = get_body_vec(target_parts)

        # A. Body Cosine: Hai con cùng chiều hay ngược chiều?
        if a_vec is not None and t_vec is not None:
            dot = np.sum(a_vec * t_vec, axis=1)
            mags = np.linalg.norm(a_vec, axis=1) * np.linalg.norm(t_vec, axis=1)
            feats["body_cosine"] = pd.Series(
                np.clip(dot / (mags + 1e-6), -1.0, 1.0), index=idx, dtype="float32"
            )
        else:
            feats["body_cosine"] = zero()

        # B. Gaze Cosine: Tôi có đang nhìn về phía Target không?
        # Vector ánh nhìn = Target_Pos - My_Pos = rel_vec
        if a_vec is not None:
            dot_gaze = np.sum(a_vec * rel_vec, axis=1)
            mag_a = np.linalg.norm(a_vec, axis=1)
            # dist đã tính ở bước 1
            feats["gaze_cosine"] = pd.Series(
                np.clip(dot_gaze / (mag_a * dist + 1e-6), -1.0, 1.0),
                index=idx, dtype="float32"
            )
        else:
            feats["gaze_cosine"] = zero()

        # --- 4. PHÂN RÃ VẬN TỐC (VELOCITY DECOMPOSITION) - CHÌA KHÓA CHO AVOID/ESCAPE ---
        # Vector đơn vị hướng về địch (u)
        dist_safe = dist.copy()
        dist_safe[dist_safe == 0] = 1e-6
        u_vec = rel_vec / dist_safe[:, None]

        # a_vel và t_vel lấy từ Context
        a_vel, t_vel = ctx.vel, target_ctx.vel

        # A. Approach Speed (Vận tốc dọc trục nối 2 con)
        # Dương: Lao vào nhau | Âm: Chạy ra xa nhau
        a_along = np.sum(a_vel * u_vec, axis=1)
        t_along = np.sum(t_vel * (-u_vec), axis=1) # Target hướng ngược lại
        rel_along = np.sum((a_vel - t_vel) * u_vec, axis=1)

        # B. Lateral Speed (Vận tốc ngang - Vuông góc trục nối)
        # Vector chiếu: v_proj = (v . u) * u
        a_proj = a_along[:, None] * u_vec
        a_lat_vec = a_vel - a_proj
        a_lat_speed = np.linalg.norm(a_lat_vec, axis=1)

        feats["approach_speed_agent"]  = pd.Series(a_along, index=idx, dtype="float32")
        feats["approach_speed_target"] = pd.Series(t_along, index=idx, dtype="float32")
        feats["approach_speed_rel"]    = pd.Series(rel_along, index=idx, dtype="float32")
        feats["lateral_speed_agent"]   = pd.Series(a_lat_speed, index=idx, dtype="float32")
        return feats
        

    # --- Compatibility methods ---
    
    def build_pose_tensor(self, tracking: pd.DataFrame):
        """
        Pivot long-format tracking rows into per-mouse tables and a position tensor.

        Args:
            tracking: long table with ``video_frame``, ``mouse_id``, ``bodypart``,
                ``x`` and ``y`` columns.

        Returns:
            ``(frames, mouse_ids, pos, per_mouse_df)`` where
              - ``frames`` is the sorted array of unique frame numbers,
              - ``mouse_ids`` is the list of mouse ids found in the data,
              - ``pos`` is a float32 array ``[len(frames), len(mouse_ids), 2]`` with
                each mouse's reference point: its ``body_center`` when tracked,
                otherwise the mean over all of its body parts (NaN where missing),
              - ``per_mouse_df`` maps each mouse id to its wide table with
                ``(bodypart, coordinate)`` columns.
        """
        ordered = tracking.sort_values("video_frame")
        frames = np.sort(ordered["video_frame"].unique())

        wide = ordered.pivot(
            index="video_frame",
            columns=["mouse_id", "bodypart"],
            values=["x", "y"]
        )
        # Column levels: (coordinate, mouse, bodypart) -> (mouse, bodypart, coordinate)
        wide = wide.reorder_levels([1, 2, 0], axis=1).sort_index(axis=1).astype("float32")

        mouse_ids = list(wide.columns.get_level_values(0).unique())
        per_mouse_df = {mid: wide[mid] for mid in mouse_ids}
        pos = np.full((len(frames), len(mouse_ids), 2), np.nan, dtype=np.float32)

        for slot, mid in enumerate(mouse_ids):
            table = per_mouse_df[mid]
            if "body_center" in table.columns.get_level_values(0):
                center = table["body_center"]
                xs, ys = center["x"], center["y"]
            else:
                xs = table.xs("x", level=1, axis=1).mean(axis=1)
                ys = table.xs("y", level=1, axis=1).mean(axis=1)

            pos[:, slot, 0] = xs.reindex(frames).values
            pos[:, slot, 1] = ys.reindex(frames).values

        return frames, mouse_ids, pos, per_mouse_df

    def extract_agent_target(
        self, 
        frames: np.ndarray, 
        mouse_ids: List[Any], 
        pos: np.ndarray, 
        agent_id: Any, 
        target_id: Any, 
        per_mouse_df: Dict = None
    ) -> pd.DataFrame:
        """
        Trích xuất đặc trưng cho cặp (Agent, Target).
        """
        try:
            aid_idx = mouse_ids.index(agent_id)
        except ValueError:
            return pd.DataFrame() 

        # 1. Build Agent Context
        ctx_agent = self._build_context(
            frames, 
            pos[:, aid_idx, :], 
            per_mouse_df.get(agent_id) if per_mouse_df else None
        )

        # 2. Build Target Context
        ctx_target = None
        if self.cfg.use_pairwise and target_id is not None and target_id in mouse_ids:
             tid_idx = mouse_ids.index(target_id)
             ctx_target = self._build_context(
                 frames, 
                 pos[:, tid_idx, :], 
                 per_mouse_df.get(target_id) if per_mouse_df else None
             )

        # 3. Run all features
        all_data = {}
        for func_name, func in self.feature_registry.items():
            out_dict = func(ctx_agent, target_ctx=ctx_target)
            all_data.update(out_dict)

        df_out = pd.DataFrame(all_data, index=ctx_agent.idx)
        df_out = df_out.replace([np.inf, -np.inf], np.nan).fillna(0.0)
        
        return df_out.reindex(sorted(df_out.columns), axis=1)

#======================================================================================
#======================================================================================
#======================================================================================



from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import gc
import itertools
import json
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedGroupKFold

# (On Kaggle) use the official metric
import sys
sys.path.append("/kaggle/usr/lib/mabe-f-beta")
from metric import score   # hàm score(submission_df, dataset_df)

# =========================================================
# 1. PATHS & CONFIGURATION
# =========================================================

# Root of the competition data and the sub-directories derived from it.
INPUT_DIR = Path("/kaggle/input/MABe-mouse-behavior-detection")
TRAIN_TRACKING_DIR = INPUT_DIR / "train_tracking"
TRAIN_ANNOTATION_DIR = INPUT_DIR / "train_annotation"
TEST_TRACKING_DIR = INPUT_DIR / "test_tracking"


WORKING_DIR = Path("/kaggle/working")
RESULTS_DIR = Path(r"/kaggle/input/results-xgb-fe")
# NOTE(review): RESULTS_DIR sits under /kaggle/input, which is presumably
# read-only on Kaggle; this mkdir would then only succeed when the directory
# already exists — confirm this is intended.
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

# Columns that identify one row: a frame of one (agent, target) pair in one video.
INDEX_COLS = ["video_id", "agent_id", "target_id", "video_frame"]

# "self" vs "pair" behaviours, same split as the notebook (adjust if desired)
SELF_BEHAVIORS = [
    "biteobject", "climb", "dig", "exploreobject", "freeze",
    "genitalgroom", "huddle", "rear", "rest", "run", "selfgroom",
]
PAIR_BEHAVIORS = [
    "allogroom", "approach", "attack", "attemptmount", "avoid",
    "chase", "chaseattack", "defend", "disengage", "dominance",
    "dominancegroom", "dominancemount", "ejaculate", "escape",
    "flinch", "follow", "intromit", "mount", "reciprocalsniff",
    "shepherd", "sniff", "sniffbody", "sniffface", "sniffgenital",
    "submit", "tussle",
]


# =========================================================
# 2. METADATA LOADING & HELPERS
# =========================================================

def load_metadata() -> pd.DataFrame:
    """Read the training metadata table ``train.csv`` from ``INPUT_DIR``."""
    return pd.read_csv(INPUT_DIR / "train.csv")


def get_video_params(video_id: Any, meta: pd.DataFrame) -> Tuple[float, float]:
    """
    Look up ``(fps, pix_per_cm)`` for one video in a metadata table.

    The first row whose ``video_id`` matches is used; values come from the
    ``frames_per_second`` and ``pix_per_cm_approx`` columns. A non-finite or
    non-positive ``pix_per_cm_approx`` is replaced by ``1.0``.

    Raises:
        KeyError: if no row of ``meta`` has the requested ``video_id``.
    """
    matches = meta[meta["video_id"] == video_id]
    if matches.empty:
        raise KeyError(f"video_id={video_id} không có trong train.csv")

    first = matches.iloc[0]
    fps = float(first["frames_per_second"])
    scale = float(first["pix_per_cm_approx"])

    # Fall back to a neutral scale when the stored value is unusable.
    usable = np.isfinite(scale) and scale > 0
    return fps, (scale if usable else 1.0)


def load_tracking(lab_id: str, video_id: Any) -> pd.DataFrame:
    """
    Load the training tracking parquet of one video as a DataFrame.

    The file is ``TRAIN_TRACKING_DIR/<lab_id>/<video_id>.parquet``
    (expected schema per the original note: video_frame, mouse_id, bodypart, x, y).

    Raises:
        FileNotFoundError: if the parquet file does not exist.
    """
    parquet_file = TRAIN_TRACKING_DIR / str(lab_id) / f"{video_id}.parquet"
    if parquet_file.exists():
        return pd.read_parquet(parquet_file)
    raise FileNotFoundError(parquet_file)

def load_tracking_test(lab_id: str, video_id: Any) -> pd.DataFrame:
    """
    Load the test tracking parquet of one video as a DataFrame.

    The file is ``INPUT_DIR/test_tracking/<lab_id>/<video_id>.parquet``.

    Raises:
        FileNotFoundError: if the parquet file does not exist.
    """
    parquet_file = INPUT_DIR / "test_tracking" / str(lab_id) / f"{video_id}.parquet"
    if parquet_file.exists():
        return pd.read_parquet(parquet_file)
    raise FileNotFoundError(parquet_file)


def load_annotation(lab_id: str, video_id: Any) -> pd.DataFrame:
    """
    Load the annotation table of one training video.

    Returns the columns agent_id, target_id, action, start_frame, stop_frame.
    When the video has no annotation file, an empty DataFrame with those
    columns is returned instead of raising.
    """
    wanted = ["agent_id", "target_id", "action", "start_frame", "stop_frame"]
    parquet_file = TRAIN_ANNOTATION_DIR / str(lab_id) / f"{video_id}.parquet"

    if parquet_file.exists():
        return pd.read_parquet(parquet_file)[wanted]

    # No labels for this video.
    return pd.DataFrame(columns=wanted)


# =========================================================
# 3. TÍNH FEATURE PER-FRAME BẰNG FEATUREEXTRACTOR
# =========================================================

# Memo table: (lab, video, agent, target) -> (frames, feature_df)
_feature_cache: Dict[Tuple[str, int, int, int], Tuple[np.ndarray, pd.DataFrame]] = {}


def get_frame_features_for_pair(
    lab_id: str,
    video_id: int,
    agent_id: int,
    target_id: int,
    meta: pd.DataFrame,
) -> Tuple[np.ndarray, pd.DataFrame]:
    """
    Compute (or fetch from the cache) per-frame features for one training
    video and one (agent, target) pair.

    Returns:
        ``(frames, features_df)`` where ``features_df`` is indexed by ``frames``.
    """
    cache_key = (str(lab_id), int(video_id), int(agent_id), int(target_id))
    cached = _feature_cache.get(cache_key)
    if cached is not None:
        return cached

    fps, pix_per_cm = get_video_params(video_id, meta)
    tracking = load_tracking(lab_id, video_id)

    extractor = FeatureExtractor(
        fps=fps,
        pix_per_cm=pix_per_cm,
        smooth_sigma=1.0,
        use_pairwise=True,
    )
    frames, mouse_ids, pos, per_mouse_df = extractor.build_pose_tensor(tracking)

    # agent and target may be the same mouse (self) or two different mice (pair).
    features_df: pd.DataFrame = extractor.extract_agent_target(
        frames=frames,
        mouse_ids=mouse_ids,
        pos=pos,
        agent_id=agent_id,
        target_id=target_id,
        per_mouse_df=per_mouse_df,
    )
    # Use the frame numbers as the index of the feature table.
    features_df.index = frames

    result = (frames, features_df)
    _feature_cache[cache_key] = result
    return result

# Feature cache, declared with a wider key type here: test agent/target ids
# are stored as given (not coerced to int) and the lab part is prefixed "test_".
_feature_cache: Dict[Tuple[str, int, Any, Any], Tuple[np.ndarray, pd.DataFrame]] = {}

def get_frame_features_for_pair_test(
    lab_id: str,
    video_id: int,
    agent_id: Any,
    target_id: Any,
    test_meta: pd.DataFrame,
) -> Tuple[np.ndarray, pd.DataFrame]:
    """
    Compute (or fetch from the cache) per-frame features for one test video
    and one (agent, target) pair.

    ``fps`` and ``pix_per_cm`` are read from the ``frames_per_second`` and
    ``pix_per_cm_approx`` columns of ``test_meta``; a non-finite or
    non-positive scale falls back to ``1.0``.

    Returns:
        ``(frames, features_df)`` where ``features_df`` is indexed by ``frames``.

    Raises:
        KeyError: if ``video_id`` is not present in ``test_meta``.
        FileNotFoundError: propagated from ``load_tracking_test`` when the
            tracking file is missing.
    """
    key = (f"test_{lab_id}", int(video_id), agent_id, target_id)
    if key in _feature_cache:
        return _feature_cache[key]

    # Fail with an explicit message (same style as get_video_params) instead
    # of the opaque IndexError that `.iloc[0]` raises on an empty selection.
    rows = test_meta[test_meta["video_id"] == video_id]
    if rows.empty:
        raise KeyError(f"video_id={video_id} không có trong test.csv")
    row = rows.iloc[0]

    fps = float(row["frames_per_second"])
    pix_per_cm = float(row["pix_per_cm_approx"])
    if not np.isfinite(pix_per_cm) or pix_per_cm <= 0:
        pix_per_cm = 1.0

    tracking = load_tracking_test(lab_id, video_id)

    fe = FeatureExtractor(
        fps=fps,
        pix_per_cm=pix_per_cm,
        smooth_sigma=1.0,
        use_pairwise=True,
    )

    frames, mouse_ids, pos, per_mouse_df = fe.build_pose_tensor(tracking)

    features_df = fe.extract_agent_target(
        frames=frames,
        mouse_ids=mouse_ids,
        pos=pos,
        agent_id=agent_id,
        target_id=target_id,
        per_mouse_df=per_mouse_df,
    )
    # Use the frame numbers as the index of the feature table.
    features_df.index = frames

    _feature_cache[key] = (frames, features_df)
    return frames, features_df



# =========================================================
# 4. BUILD FRAME-LEVEL DATASET CHO 1 (lab_id, behavior)
# =========================================================

def build_frame_dataset_for_lab_behavior(
    lab_id: str,
    behavior: str,
    train_meta: pd.DataFrame,
    mode: str = "self",
) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray]:
    """
    Assemble the frame-level dataset for one (lab, behavior).

    For every video of the lab that has annotations of ``behavior``, and for
    every distinct annotated (agent, target) pair, per-frame features are
    computed and a binary label marks frames inside any annotated
    ``[start_frame, stop_frame)`` interval. In ``"self"`` mode the features
    are computed with the agent as its own target.

    Returns:
        indices:  DataFrame with the ``INDEX_COLS`` columns.
        features: DataFrame of per-frame features (row-aligned with indices).
        labels:   int8 array of 0/1 labels.
        All three are empty when no positive frame is found.
    """
    lab_rows = train_meta[train_meta["lab_id"] == lab_id]
    video_ids = lab_rows["video_id"].unique().tolist()

    idx_parts = []
    feat_parts = []
    label_parts = []

    for video_id in video_ids:
        ann = load_annotation(lab_id, video_id)
        if ann.empty:
            continue

        # Keep only the annotations of the requested behavior.
        ann_bhv = ann[ann["action"] == behavior]
        if ann_bhv.empty:
            continue

        # Distinct (agent, target) pairs to process for this video.
        unique_pairs = (
            ann_bhv[["agent_id", "target_id"]].drop_duplicates().values.tolist()
        )
        for agent_id, target_id in unique_pairs:
            feature_target = agent_id if mode == "self" else target_id

            frames, feat_df = get_frame_features_for_pair(
                lab_id=lab_id,
                video_id=video_id,
                agent_id=agent_id,
                target_id=feature_target,
                meta=train_meta,
            )

            # Annotations of this exact pair; in self mode fall back to every
            # annotation of the agent when the exact match is empty.
            agent_mask = ann_bhv["agent_id"] == agent_id
            ann_pair = ann_bhv[agent_mask & (ann_bhv["target_id"] == target_id)]
            if mode == "self" and ann_pair.empty:
                ann_pair = ann_bhv[ann_bhv["agent_id"] == agent_id]

            # Union of all frames covered by the half-open intervals.
            pos_frames = set()
            for start, stop in zip(ann_pair["start_frame"], ann_pair["stop_frame"]):
                pos_frames.update(range(int(start), int(stop)))
            if not pos_frames:
                continue

            label = np.isin(frames, list(pos_frames)).astype("int8")
            if not label.any():
                continue

            idx_parts.append(
                pd.DataFrame(
                    {
                        "video_id": video_id,
                        "agent_id": agent_id,
                        "target_id": target_id,
                        "video_frame": frames,
                    }
                )
            )
            feat_parts.append(feat_df.reset_index(drop=True))
            label_parts.append(label)

    if not idx_parts:
        return (
            pd.DataFrame(columns=INDEX_COLS),
            pd.DataFrame(),
            np.zeros(0, dtype="int8"),
        )

    indices = pd.concat(idx_parts, ignore_index=True)
    features = pd.concat(feat_parts, ignore_index=True)
    labels = np.concatenate(label_parts).astype("int8")

    assert len(indices) == len(features) == len(labels)

    return indices, features, labels


# =========================================================
# 5. TRAIN + OOF CHO 1 (lab_id, behavior)
# =========================================================

def tune_threshold(oof_pred: np.ndarray, y: np.ndarray) -> float:
    """
    Pick the decision threshold that maximises F1 on the given predictions.

    Candidates are 0.0, 0.005, ..., 1.0; on ties the smallest threshold wins.
    """
    candidates = np.arange(0.0, 1.005, 0.005)
    f1_by_threshold = np.array(
        [f1_score(y, (oof_pred >= th), zero_division=0) for th in candidates]
    )
    best = int(f1_by_threshold.argmax())
    return float(candidates[best])

#
def train_validate_one(
    lab_id: str,
    behavior: str,
    indices: pd.DataFrame,
    features: pd.DataFrame,
    labels: np.ndarray,
) -> float:
    """
    Train binary XGBoost models for one (lab, behavior) with 3-fold
    StratifiedGroupKFold (grouped by video_id) and save the out-of-fold
    (OOF) predictions.

    Written under ``RESULTS_DIR/<lab_id>/<behavior>``:
      - ``fold_<k>/model.json`` and ``fold_<k>/threshold.txt`` for each fold,
      - ``oof_predictions.parquet`` (indices + fold, prediction, predicted_label),
      - ``f1.txt`` with the frame-level OOF F1.

    Args:
        lab_id: lab identifier (used as a directory name).
        behavior: behavior/action name (used as a directory name).
        indices: frame-level index table; must contain ``video_id``.
        features: per-frame feature table, row-aligned with ``indices``.
        labels: 0/1 label per row.

    Returns:
        Frame-level F1 over all OOF predictions (0.0 when there are no
        samples or no positives).
    """
    result_dir = RESULTS_DIR / lab_id / behavior
    result_dir.mkdir(parents=True, exist_ok=True)

    n = len(labels)

    # Degenerate dataset: nothing to learn, write an all-zero OOF and stop.
    if n == 0 or labels.sum() == 0:
        oof_df = indices.copy()
        oof_df["fold"] = -1
        oof_df["prediction"] = 0.0
        oof_df["predicted_label"] = 0
        oof_df.to_parquet(result_dir / "oof_predictions.parquet", index=False)
        (result_dir / "f1.txt").write_text("0.0\n")
        return 0.0

    X = features.values.astype("float32")
    y = labels.astype("int8")
    # Group by video so the frames of one video stay in a single fold.
    groups = indices["video_id"].values

    # -1 marks rows that were never part of a validation fold.
    folds = np.ones(n, dtype="int8") * -1
    oof_pred = np.zeros(n, dtype="float32")
    oof_label = np.zeros(n, dtype="int8")

    cv = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=42)

    for fold, (tr_idx, va_idx) in enumerate(cv.split(X, y, groups=groups)):
        fold_dir = result_dir / f"fold_{fold}"
        fold_dir.mkdir(parents=True, exist_ok=True)

        X_tr, y_tr = X[tr_idx], y[tr_idx]
        X_va, y_va = X[va_idx], y[va_idx]

        # scale_pos_weight = negatives / positives of the training split
        pos = y_tr.sum()
        neg = len(y_tr) - pos
        scale_pos_weight = float(neg / pos) if pos > 0 else 1.0

        params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "device": "cuda",
            "tree_method": "hist",
            "learning_rate": 0.05,
            "max_depth": 6,
            "min_child_weight": 5,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "scale_pos_weight": scale_pos_weight,
            "max_bin": 64,
            "seed": 42,
        }

        # max_bin here mirrors the "max_bin" entry of params.
        dtrain = xgb.QuantileDMatrix(
            X_tr,
            label=y_tr,
            feature_names=features.columns.tolist(),
            max_bin=64,
        )
        dvalid = xgb.DMatrix(
            X_va,
            label=y_va,
            feature_names=features.columns.tolist(),
        )

        evals_result: Dict[str, Dict[str, List[float]]] = {}

        # Stop when the validation logloss has not improved for 10 rounds.
        early_stop = xgb.callback.EarlyStopping(
            rounds=10, metric_name="logloss", data_name="valid", maximize=False
        )

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=250,
            evals=[(dtrain, "train"), (dvalid, "valid")],
            callbacks=[early_stop],
            evals_result=evals_result,
            verbose_eval=False,
        )

        pred_va = model.predict(dvalid)
        # NOTE(review): the threshold is tuned on the same validation fold it is
        # then applied to, so the OOF labels / F1 below are optimistic.
        th = tune_threshold(pred_va, y_va)

        folds[va_idx] = fold
        oof_pred[va_idx] = pred_va
        oof_label[va_idx] = (pred_va >= th).astype("int8")

        # Persist the fold model and its threshold for inference.
        model.save_model(fold_dir / "model.json")
        with open(fold_dir / "threshold.txt", "w") as f:
            f.write(f"{th}\n")

    # Save the OOF table.
    oof_df = indices.copy()
    oof_df["fold"] = folds
    oof_df["prediction"] = oof_pred
    oof_df["predicted_label"] = oof_label
    oof_df.to_parquet(result_dir / "oof_predictions.parquet", index=False)

    f1 = f1_score(y, oof_label, zero_division=0)
    (result_dir / "f1.txt").write_text(f"{f1:.6f}\n")
    return float(f1)

def load_models_for_behavior_infer(lab_id: str, behavior: str):
    """
    Load every saved fold model and its threshold for one (lab, behavior).

    Looks into ``RESULTS_DIR/<lab_id>/<behavior>/fold_*`` (sorted by name).
    Folds without ``model.json`` are skipped; a missing ``threshold.txt``
    defaults the threshold to 0.5.

    Returns:
        List of ``(booster, threshold)`` tuples; empty when the directory
        does not exist or holds no model.
    """
    loaded = []
    base_dir = RESULTS_DIR / lab_id / behavior
    if not base_dir.exists():
        return loaded

    for fold_dir in sorted(base_dir.glob("fold_*")):
        model_path = fold_dir / "model.json"
        if not model_path.exists():
            continue

        booster = xgb.Booster()
        booster.load_model(str(model_path))

        threshold_path = fold_dir / "threshold.txt"
        threshold = (
            float(threshold_path.read_text().strip())
            if threshold_path.exists()
            else 0.5
        )
        loaded.append((booster, threshold))

    return loaded


# =========================================================
# 6. LOOP QUA TẤT CẢ BEHAVIORS TRONG 1 LAB
#    (train_all_labs_behaviors vẫn giữ nguyên, nhưng main
#     sẽ filter train_meta chỉ còn 1 lab)
# =========================================================

def train_all_labs_behaviors(train_meta: pd.DataFrame):
    """
    Train one frame-level model per (lab, action) for every lab in ``train_meta``.

    For each lab: read the annotations of all its videos, collect the set of
    actions that actually occur, then build the dataset and train/validate a
    model for each action (in sorted order). Progress is printed to stdout.
    """
    lab_ids = train_meta["lab_id"].unique().tolist()
    t0 = time.perf_counter()

    for lab_id in lab_ids:
        # Videos belonging to this lab.
        lab_videos = (
            train_meta.loc[train_meta["lab_id"] == lab_id, "video_id"]
            .unique()
            .tolist()
        )

        # Every action that appears in at least one annotation of the lab.
        seen_actions = set()
        for vid in lab_videos:
            ann = load_annotation(lab_id, vid)
            if not ann.empty:
                seen_actions.update(ann["action"].unique().tolist())

        behaviors = sorted(seen_actions)
        print(f"\n===== LAB {lab_id}: {len(behaviors)} behaviors =====")

        for behavior in behaviors:
            mode = "self" if behavior in SELF_BEHAVIORS else "pair"
            print(f"\n=== LAB={lab_id} | behavior={behavior} | mode={mode} ===")

            indices, features, labels = build_frame_dataset_for_lab_behavior(
                lab_id=str(lab_id),
                behavior=behavior,
                train_meta=train_meta,
                mode=mode,
            )
            n_features = features.shape[1] if not features.empty else 0
            print(
                f"frames: {len(labels):,}, positives: {labels.sum():,}, features: "
                f"{n_features}"
            )

            if len(labels) == 0:
                print(" -> skip (no samples)")
                continue

            f1 = train_validate_one(str(lab_id), behavior, indices, features, labels)
            elapsed = time.perf_counter() - t0
            print(f" -> OOF F1 (frame-level): {f1:.3f} | elapsed={elapsed/60:.1f} min")



# =========================================================
# 7. GOM OOF PREDICTION → SEGMENT & TÍNH SCORE()
# =========================================================

def build_oof_submission_from_parquet(
    target_lab_id: Optional[str] = None,
) -> pd.DataFrame:
    """
    Turn the saved OOF frame predictions into segment-level predictions.

    Every ``oof_predictions.parquet`` found under ``RESULTS_DIR`` is read
    (lab and behavior are taken from the two parent directory names),
    predictions are thresholded at a fixed 0.5, and each run of consecutive
    positive rows of a (lab, video, agent, target, action) group becomes one
    ``[start_frame, stop_frame)`` segment.

    Args:
        target_lab_id: when given, only OOF files of that lab are used.

    Returns:
        DataFrame with columns lab_id, video_id, agent_id, target_id, action,
        start_frame, stop_frame (empty when no frame is positive).

    Raises:
        RuntimeError: if no OOF file exists, or none belongs to ``target_lab_id``.
    """
    group_keys = ["lab_id", "video_id", "agent_id", "target_id", "action"]

    oof_files = list(RESULTS_DIR.glob("*/**/oof_predictions.parquet"))
    if not oof_files:
        raise RuntimeError("Không tìm thấy OOF parquet, hãy train trước.")

    frame_preds = []
    for path in oof_files:
        # Layout: <RESULTS_DIR>/<lab>/<behavior>/oof_predictions.parquet
        lab_id, behavior = path.parts[-3], path.parts[-2]
        if target_lab_id is not None and lab_id != target_lab_id:
            continue

        df = pd.read_parquet(path)
        df = df[INDEX_COLS + ["prediction"]].copy()
        df["lab_id"] = lab_id
        df["action"] = behavior
        frame_preds.append(df)

    if not frame_preds:
        raise RuntimeError(
            f"Không có OOF predictions nào cho lab_id={target_lab_id}"
        )

    frame_df = (
        pd.concat(frame_preds, ignore_index=True)
        .sort_values(group_keys + ["video_frame"])
        .reset_index(drop=True)
    )

    def segment_row(keys, start, stop):
        # Build one output record; ids are coerced to int only when emitted.
        lab_id, video_id, agent_id, target_id, action = keys
        return {
            "lab_id": lab_id,
            "video_id": int(video_id),
            "agent_id": int(agent_id),
            "target_id": int(target_id),
            "action": action,
            "start_frame": start,
            "stop_frame": stop,
        }

    segments = []
    for keys, group in frame_df.groupby(group_keys, sort=False):
        frames = group["video_frame"].values
        # Fixed demo threshold (per-(lab, behavior) thresholds are not applied here).
        hard = group["prediction"].values >= 0.5

        seg_start = None  # start frame of the currently open segment, if any
        last_frame = None
        for frame, is_positive in zip(frames, hard):
            if is_positive:
                if seg_start is None:
                    seg_start = int(frame)
            elif seg_start is not None:
                # Close the segment just after the last positive frame.
                segments.append(segment_row(keys, seg_start, int(last_frame + 1)))
                seg_start = None
            last_frame = frame

        # A segment still open at the end of the group runs to its last frame.
        if seg_start is not None:
            segments.append(segment_row(keys, seg_start, int(frames[-1] + 1)))

    if not segments:
        return pd.DataFrame(columns=group_keys + ["start_frame", "stop_frame"])

    submission = pd.DataFrame(segments)
    return submission.sort_values(group_keys + ["start_frame"]).reset_index(drop=True)

# Video ids to exclude from validation (none by default).
BAD_VIDEOS = []

def compute_validation_score(
    submission: pd.DataFrame,
    lab_id: Optional[str] = None,
) -> float:
    """
    Evaluate ``submission`` with the official ``score()`` metric on the train set.

    The ground truth is rebuilt from the per-video annotation files of every
    video listed in ``train.csv`` (minus ``BAD_VIDEOS``), each tagged with its
    lab_id, video_id and ``behaviors_labeled`` value.

    Args:
        submission: segment-level predictions; must contain ``lab_id`` when
            ``lab_id`` is given.
        lab_id: when given, both ground truth and submission are restricted
            to that lab.

    Returns:
        The metric value, or 0.0 when no annotation is available.
    """
    train_meta = pd.read_csv(INPUT_DIR / "train.csv")

    single_lab = lab_id is not None
    if single_lab:
        train_meta = train_meta[train_meta["lab_id"] == lab_id].reset_index(drop=True)

    if BAD_VIDEOS:
        train_meta = train_meta[~train_meta["video_id"].isin(BAD_VIDEOS)]

    # Read the annotation file of every remaining video.
    all_annotations = []
    for _, meta_row in train_meta.iterrows():
        lab = meta_row["lab_id"]
        vid = meta_row["video_id"]
        ann = load_annotation(lab, vid)
        if ann.empty:
            continue
        ann["lab_id"] = lab
        ann["video_id"] = vid
        ann["behaviors_labeled"] = meta_row["behaviors_labeled"]
        all_annotations.append(ann)

    if not all_annotations:
        print("Không có annotation nào để validate!")
        return 0.0

    dataset = pd.concat(all_annotations, ignore_index=True)

    # Restrict the submission to the requested lab as well.
    if single_lab:
        submission = submission[submission["lab_id"] == lab_id].reset_index(drop=True)

    s = score(dataset, submission, row_id_column_name="row_id")

    lab_suffix = " (lab=" + lab_id + ")" if single_lab else ""
    print(f"Official validation score{lab_suffix}: {s:.6f}")
    return float(s)



# =========================================================
# 8. MAIN
# =========================================================
def str_to_mouse_id(s: str) -> int:
    """Map ``"self"`` to -1, otherwise parse the id after stripping ``"mouse"``."""
    return -1 if s == "self" else int(str(s).replace("mouse", ""))


def predict_behaviors_for_pair(
    lab_id: str,
    video_id: int,
    agent_internal_id: Any,
    target_internal_id: Any,
    behaviors: List[str],
    test_meta: pd.DataFrame,
) -> pd.DataFrame:
    """
    Run inference for one (video, agent, target) pair over a list of
    behaviors (all "self" or all "pair") and return segment-level predictions.

    For each behavior with saved fold models, the per-frame score is the mean
    over folds of the predicted probability, counted as 0 for folds where it
    is below that fold's threshold. Each frame is assigned the behavior with
    the highest score ("none" when every score is 0), and runs of identical
    labels are merged into ``[start_frame, stop_frame)`` segments.

    Returns:
        DataFrame with columns video_id, action, start_frame, stop_frame.
        It is empty (never ``None``) when ``lab_id`` is not "JovialSwallow",
        when there are no features, no models, or no predicted segment.
    """
    segment_cols = ["video_id", "action", "start_frame", "stop_frame"]

    # This copy of the function only serves lab "JovialSwallow". Return an
    # empty frame rather than None so callers can always use `.empty`.
    if lab_id != "JovialSwallow":
        return pd.DataFrame(columns=segment_cols)

    frames, feat_df = get_frame_features_for_pair_test(
        lab_id=lab_id,
        video_id=video_id,
        agent_id=agent_internal_id,
        target_id=target_internal_id,
        test_meta=test_meta,
    )
    if feat_df.empty:
        return pd.DataFrame(columns=segment_cols)

    feat_df = feat_df.astype("float32")
    n_frames = len(feat_df)
    available_feats = set(feat_df.columns)

    scores_per_behavior = {}
    for behavior in behaviors:
        models = load_models_for_behavior_infer(lab_id, behavior)
        if not models:
            continue

        # Build X_test with exactly the feature set (and order) of the model;
        # features the extractor did not produce stay at 0.
        req_feats = models[0][0].feature_names
        X_test = pd.DataFrame(
            0.0,
            index=feat_df.index,
            columns=req_feats,
            dtype=np.float32,
        )
        # Keep the model's column order so the selection is deterministic.
        common = [name for name in req_feats if name in available_feats]
        if common:
            X_test[common] = feat_df[common]

        dtest = xgb.DMatrix(X_test, feature_names=req_feats)

        agg_scores = np.zeros(n_frames, dtype=np.float32)
        for booster, thr in models:
            probs = booster.predict(dtest)
            # A fold contributes its probability only where it passes its threshold.
            above_thr = (probs >= thr).astype(np.int8)
            agg_scores += probs * above_thr

        agg_scores /= max(len(models), 1)
        scores_per_behavior[behavior] = agg_scores

        del dtest, X_test
        gc.collect()

    if not scores_per_behavior:
        return pd.DataFrame(columns=segment_cols)

    beh_list = list(scores_per_behavior.keys())
    score_mat = np.vstack([scores_per_behavior[b] for b in beh_list]).T  # [F, B]

    max_idx = score_mat.argmax(axis=1)
    max_scores = score_mat.max(axis=1)
    frame_labels = np.where(max_scores == 0.0, "none", np.array(beh_list)[max_idx])

    # frame-level labels -> segments of identical consecutive labels
    segments = []
    prev_lab = "none"
    prev_start = None
    prev_f = None

    for f, lab in zip(frames, frame_labels):
        if lab != prev_lab:
            if prev_lab != "none":
                segments.append(
                    {
                        "video_id": int(video_id),
                        "action": prev_lab,
                        "start_frame": int(prev_start),
                        "stop_frame": int(prev_f + 1),
                    }
                )
            prev_lab = lab
            prev_start = f
        prev_f = f

    # Flush the segment that is still open at the last frame.
    if prev_lab != "none":
        segments.append(
            {
                "video_id": int(video_id),
                "action": prev_lab,
                "start_frame": int(prev_start),
                "stop_frame": int(prev_f + 1),
            }
        )

    if not segments:
        return pd.DataFrame(columns=segment_cols)

    return pd.DataFrame(segments)



# Inference driver for a single lab: predict every test video of `target_lab`
# and write the segment-level submission to WORKING_DIR / "submission5.csv".
target_lab = "JovialSwallow"
print(f"Đọc test.csv cho lab {target_lab} ...")
test_meta = pd.read_csv(INPUT_DIR / "test.csv")
test_meta = test_meta[test_meta["lab_id"] == target_lab].reset_index(drop=True)

# Trained behaviors = sub-directories of RESULTS_DIR/<target_lab>.
lab_result_dir = RESULTS_DIR / target_lab
if lab_result_dir.exists():
    trained_behaviors = sorted(
        p.name for p in lab_result_dir.iterdir() if p.is_dir()
    )
else:
    trained_behaviors = []

self_behaviors_in_lab = [b for b in trained_behaviors if b in SELF_BEHAVIORS]
pair_behaviors_in_lab = [b for b in trained_behaviors if b in PAIR_BEHAVIORS]

print("Behaviors (self) dùng để predict:", self_behaviors_in_lab)
print("Behaviors (pair) dùng để predict:", pair_behaviors_in_lab)


def to_submit_id(mid):
    """Map an internal mouse_id to its submission string ("mouse<id>")."""
    s = str(mid)
    return s if s.startswith("mouse") else f"mouse{s}"


all_segments = []

# Loop over every test video of the lab.
for video_id in sorted(test_meta["video_id"].unique()):
    print(f"Predict video_id={video_id} ...")

    tracking = load_tracking_test(target_lab, video_id)
    mouse_ids_internal = sorted(tracking["mouse_id"].unique().tolist())

    # SELF behaviors: agent == target.
    if self_behaviors_in_lab:
        for mid in mouse_ids_internal:
            seg_df = predict_behaviors_for_pair(
                lab_id=target_lab,
                video_id=video_id,
                agent_internal_id=mid,
                target_internal_id=mid,  # self
                behaviors=self_behaviors_in_lab,
                test_meta=test_meta,
            )
            # Tolerate a None result as well as an empty frame.
            if seg_df is not None and not seg_df.empty:
                seg_df["agent_id"] = to_submit_id(mid)
                seg_df["target_id"] = "self"
                all_segments.append(seg_df)

    # PAIR behaviors: every ordered pair with agent != target.
    if pair_behaviors_in_lab and len(mouse_ids_internal) > 1:
        for agent_internal, target_internal in itertools.permutations(
            mouse_ids_internal, 2
        ):
            seg_df = predict_behaviors_for_pair(
                lab_id=target_lab,
                video_id=video_id,
                agent_internal_id=agent_internal,
                target_internal_id=target_internal,
                behaviors=pair_behaviors_in_lab,
                test_meta=test_meta,
            )
            if seg_df is not None and not seg_df.empty:
                seg_df["agent_id"] = to_submit_id(agent_internal)
                seg_df["target_id"] = to_submit_id(target_internal)
                all_segments.append(seg_df)

# Merge all segments into the submission table written to submission5.csv.
if all_segments:
    submission5 = pd.concat(all_segments, ignore_index=True)
    submission5 = submission5[
        ["video_id", "agent_id", "target_id", "action", "start_frame", "stop_frame"]
    ]
    submission5 = submission5.sort_values(
        ["video_id", "agent_id", "target_id", "action", "start_frame"]
    ).reset_index(drop=True)
else:
    # Empty table with the right columns; deliberately NO dummy row.
    submission5 = pd.DataFrame(
        columns=[
            "video_id",
            "agent_id",
            "target_id",
            "action",
            "start_frame",
            "stop_frame",
        ]
    )

# Add row_id as the first column (also when the table is empty).
submission5.insert(0, "row_id", np.arange(len(submission5), dtype=np.int64))

sub_path = WORKING_DIR / "submission5.csv"
submission5.to_csv(sub_path, index=False)
print(f"Saved JovialSwallow submission to {sub_path}")


Đọc test.csv cho lab JovialSwallow ...
Behaviors (self) dùng để predict: []
Behaviors (pair) dùng để predict: ['attack', 'chase', 'sniff']
Saved JovialSwallow submission to /kaggle/working/submission5.csv


# PleasantMeerkat

In [10]:
from pathlib import Path
import shutil
import gc

WORKING_DIR = Path("/kaggle/working")

# 1) Remove everything directly inside /kaggle/working except .csv files.
for path in WORKING_DIR.iterdir():
    if path.is_file():
        # Keep the .csv files (submissions).
        if path.suffix == ".csv":
            continue
        try:
            path.unlink()
        except Exception as err:
            print(f"Cannot remove file {path}: {err}")
    elif path.is_dir():
        try:
            shutil.rmtree(path, ignore_errors=True)
        except Exception as err:
            print(f"Cannot remove dir {path}: {err}")


# Reclaim memory held by objects of the previous section.
gc.collect()

77

In [11]:
from __future__ import annotations
from typing import Dict, List, Tuple, Any, Optional
import warnings
from dataclasses import dataclass, field
import numpy as np
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from tqdm import tqdm

# Silence pandas PerformanceWarning messages.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
# Keep NumPy quiet on invalid floating-point operations and divisions by zero.
np.seterr(invalid="ignore", divide="ignore")

# =============================================================================
# 1. CONFIGURATION
# =============================================================================
@dataclass
class FeatureConfig:
    """
    Holds the configuration parameters (hyperparameters) of the feature extractor.
    """
    fps: float = 30.0            # video frame rate
    pix_per_cm: float = 1.0      # pixels per centimetre (pixel -> cm conversion)
    smooth_sigma: float = 1.0    # sigma of the Gaussian smoothing filter
    use_pairwise: bool = True    # presumably toggles the pairwise features - TODO confirm


# =============================================================================
# 2. AGENT CONTEXT
# =============================================================================
@dataclass
class AgentContext:
    """
    Container for the pre-processed data of a single mouse.
    Avoids recomputing velocity/acceleration several times.
    """
    idx: pd.Index          # frame index
    pos: np.ndarray        # [F, 2] cm
    vel: np.ndarray        # [F, 2] cm/s
    speed: np.ndarray      # [F, 1] cm/s
    acc: np.ndarray        # [F, 2] cm/s^2
    
    cx: pd.Series          # X coordinate as a Series (for rolling operations)
    cy: pd.Series          # Y coordinate as a Series
    speed_series: pd.Series # speed as a Series
    
    raw_df: Optional[pd.DataFrame] = None # raw data of the body parts


# =============================================================================
# 3. FEATURE EXTRACTOR
# =============================================================================
class FeatureExtractor:
    """
    Class trích xuất đặc trưng hành vi từ dữ liệu tracking.
    """
    def __init__(self, fps: float, pix_per_cm: float, smooth_sigma: float = 1.0, use_pairwise: bool = True):
        """
        Store the extraction settings and register the feature modules.

        fps and pix_per_cm are coerced to float; smooth_sigma and use_pairwise
        are stored exactly as given.
        """
        settings = dict(
            fps=float(fps),
            pix_per_cm=float(pix_per_cm),
            smooth_sigma=smooth_sigma,
            use_pairwise=use_pairwise,
        )
        self.cfg = FeatureConfig(**settings)

        # Feature modules, listed in the order in which they are executed.
        modules = (
            ("kinematics", self._feat_basic_kinematics),
            ("multiscale", self._feat_multiscale),
            ("long_range", self._feat_long_range),
            ("cumulative", self._feat_cumulative),
            ("curvature", self._feat_curvature),
            ("speed_asym", self._feat_speed_asym),
            ("gauss_shift", self._feat_gauss_shift),
            ("avoid", self._feat_avoidance_trajectory),
            ("pose", self._feat_pose_shape),
            ("a", self._feat_follow_pattern),
            ("b", self._feat_shortburst_social),
            ("pairwise", self._feat_pairwise),
        )
        self.feature_registry = dict(modules)

    # --- Helpers ---
    def _scale(self, n_frames_30fps: int) -> int:
        """Quy đổi số frame từ chuẩn 30fps sang fps thực tế của video."""
        return max(1, int(round(n_frames_30fps * self.cfg.fps / 30.0)))

    def _to_cm(self, arr):
        """Chuyển pixel -> cm."""
        return arr / self.cfg.pix_per_cm

    def _smooth(self, x):
        """Làm mượt dữ liệu bằng Gaussian filter."""
        if self.cfg.smooth_sigma is None or x.shape[0] < 3: return x
        if np.all(np.isnan(x)): return x
        return gaussian_filter1d(x, sigma=self.cfg.smooth_sigma, axis=0, mode="nearest")

    def _forward_fill_nan(self, pos):
        """
        Điền dữ liệu thiếu (NaN) bằng giá trị hợp lệ trước đó (Forward Fill).
        """
        if np.all(np.isnan(pos)):
            return np.zeros_like(pos)

        pos_ffill = pos.copy()
        mask = np.any(~np.isnan(pos_ffill), axis=1)
        if not mask.any():
            return np.zeros_like(pos_ffill)

        valid_idx = np.where(mask)[0]
        first, last = valid_idx[0], valid_idx[-1]
        pos_ffill[:first] = pos_ffill[first]
        pos_ffill[last + 1:] = pos_ffill[last]
        df_temp = pd.DataFrame(pos_ffill)
        df_temp = df_temp.ffill()
        return df_temp.to_numpy()
    
    def _speed_series(self, cx: pd.Series, cy: pd.Series) -> pd.Series:
        dx = cx.diff()
        dy = cy.diff()
        v = np.hypot(dx, dy).fillna(0.0) * self.cfg.fps
        return v.astype("float32")
    
    def _roll_future_mean(self, s: pd.Series, w: int, min_p: int = 1) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).mean().iloc[::-1]

    def _roll_future_var(self, s: pd.Series, w: int, min_p: int = 2) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).var().iloc[::-1]

    # --- Core Logic ---
    def _compute_kinematics(self, pos_px: np.ndarray):
        """
        Tính toán vật lý cơ bản: Pos(cm), Vel, Speed, Acc.
        Input: Array [Frames, 2] (pixel).
        Output: Tuple (pos_cm, vel, speed, acc).
        """
        pos_ffill = self._forward_fill_nan(pos_px)
        pos_cm = self._to_cm(pos_ffill.astype(np.float32))
        pos_cm = self._smooth(pos_cm)                                               # [F, 2]

        dt = 1.0 / self.cfg.fps
        vel = np.zeros_like(pos_cm, dtype=np.float32)
        vel[1:] = (pos_cm[1:] - pos_cm[:-1]) / dt                                   # [F, 2: (vx, vy)]
        speed = np.linalg.norm(vel, axis=1, keepdims=True).astype(np.float32)       # [F, 1]

        acc = np.zeros_like(pos_cm, dtype=np.float32)                          
        acc[1:] = (vel[1:] - vel[:-1]) / dt                                         # [F, 2:(ax, ay)]
        return pos_cm.astype(np.float32), vel, speed, acc

    def _build_context(self, frames, pos_px, mouse_df=None) -> AgentContext:
        """
        Build the AgentContext of one mouse: kinematic arrays plus Series views
        of x, y and speed indexed by frame, and the optional raw body-part frame.
        """
        pos_cm, vel, speed, acc = self._compute_kinematics(pos_px)
        frame_index = pd.Index(frames, name="frame")

        def on_frames(values):
            return pd.Series(values, index=frame_index)

        return AgentContext(
            idx=frame_index,
            pos=pos_cm,
            vel=vel,
            speed=speed,
            acc=acc,
            cx=on_frames(pos_cm[:, 0]),
            cy=on_frames(pos_cm[:, 1]),
            speed_series=on_frames(speed[:, 0]),
            raw_df=mouse_df,
        )

    # --- Feature Modules ---
    def _feat_basic_kinematics(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Lấy các giá trị thô: tọa độ x, y, vận tốc vx, vy, tốc độ, gia tốc ax, ay.
        """
        return {
            "a_x": ctx.pos[:, 0], "a_y": ctx.pos[:, 1],
            "a_vx": ctx.vel[:, 0], "a_vy": ctx.vel[:, 1],
            "a_speed": ctx.speed[:, 0],
            "a_ax": ctx.acc[:, 0], "a_ay": ctx.acc[:, 1]
        }

    def _feat_multiscale(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tính tốc độ trung bình (Mean) và độ lệch chuẩn (Std) ở đa mức thời gian.
        Feature 'sp_ratio' đo độ bùng nổ (Burstiness).
        """
        feats = {}
        speed = ctx.speed_series
        frame_scales = [10, 40, 160]
        for scale in frame_scales:
            ws = self._scale(scale)
            if len(speed) >= ws:
                roller = speed.rolling(ws, min_periods=max(1, ws//4), center=True)
                feats[f"sp_m{scale}"] = roller.mean().astype("float32")
                feats[f"sp_s{scale}"] = roller.std().astype("float32")
        feats[f"sp_ratio"] = feats["sp_m10"] / (feats["sp_m160"] + 1e-6)
        return feats 

    
        
    def _feat_long_range(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Đặc trưng ngữ cảnh dài hạn:
        - x_ml, y_ml: Vị trí trung bình trong quá khứ.
        - sp_pct: Xếp hạng (percentile) của tốc độ hiện tại so với quá khứ.
        """
        feats: Dict[str, pd.Series] = {}
        speed = ctx.speed_series

        for window in [120, 240]:
            ws = self._scale(window)
            if len(ctx.cx) >= ws:
                feats[f"x_ml{window}"] = ctx.cx.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()
                feats[f"y_ml{window}"] = ctx.cy.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()

        for span in [60, 120]:
            s = self._scale(span)
            feats[f"x_e{span}"] = ctx.cx.ewm(span=s, min_periods=1).mean()
            feats[f"y_e{span}"] = ctx.cy.ewm(span=s, min_periods=1).mean()

        for window in [60, 120]:
            ws = self._scale(window)
            if len(speed) >= ws:
                feats[f"sp_pct{window}"] = speed.rolling(
                    ws, min_periods=max(5, ws // 6), center=True
                ).rank(pct=True)
        return feats
    

    def _feat_curvature(self, ctx: AgentContext, **kwargs) -> Dict:
        feats = {}

        vel_x, vel_y = ctx.vel[:, 0], ctx.vel[:, 1]
        acc_x, acc_y = ctx.acc[:, 0], ctx.acc[:, 1]
        cross_prod = vel_x * acc_y - vel_y * acc_x
        vel_mag = np.sqrt(vel_x**2 + vel_y**2)
        moving_mask = vel_mag > 2.0
        vel_mag_safe = np.maximum(vel_mag, 0.1 / self.cfg.fps)
        raw_curv = cross_prod / (vel_mag_safe**3)
        raw_curv = np.where(moving_mask, raw_curv, 0.0)
        min_turn_radius_cm = 0.5
        max_k = 1.0 / min_turn_radius_cm
        raw_curv = np.clip(raw_curv, -max_k, max_k)
        abs_curv = np.abs(raw_curv)
        abs_curv_series = pd.Series(abs_curv, index=ctx.idx)

        for w in [30, 60]:
            ws = self._scale(w)
            min_p = max(ws // 3, 1)
            feats[f"curv_mean_{w}"] = abs_curv_series.rolling(ws, min_periods=min_p).mean()

        angle = np.arctan2(vel_y, vel_x)
        angle_series = pd.Series(angle, index=ctx.idx)
        angle_change = np.abs(angle_series.diff().fillna(0.0))
        angle_change = np.where(angle_change > np.pi, 2 * np.pi - angle_change, angle_change)
        angle_change_series = pd.Series(angle_change, index=ctx.idx)
        angle_change_series = pd.Series(np.where(moving_mask, angle_change_series, 0.0), index=ctx.idx)

        ws = self._scale(30)
        feats["turn_rate_30"] = angle_change_series.rolling(ws, min_periods=max(ws // 3, 1)).sum()

        return feats
    
    def _feat_cumulative(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tổng quãng đường di chuyển trong một khoảng thời gian dài xung quanh frame hiện tại.
        """
        feats = {}
        L = max(1, self._scale(180))
        step = np.hypot(ctx.cx.diff(), ctx.cy.diff()).fillna(0.0)
        path = step.rolling(2 * L + 1, min_periods=max(5, L // 6), center=True).sum()
        feats["path_cum180"] =  path.fillna(0.0).astype("float32")
        return feats

    def _feat_speed_asym(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Bất đối xứng tốc độ (Tương lai - Quá khứ).
        """
        w = max(3, self._scale(30))
        v = ctx.speed_series
        v_past = v.rolling(w, min_periods=1).mean()
        v_fut = self._roll_future_mean(v, w, min_p=1)
        return {"spd_asym_1s": (v_fut - v_past).fillna(0.0)}
    
    def _feat_gauss_shift(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Độ lệch Gaussian (KL Divergence) giữa quá khứ và tương lai.
        Đo lường sự thay đổi trạng thái thống kê.
        """
        w = max(5, self._scale(30))
        v = ctx.speed_series
        mu_p = v.rolling(w, min_periods=1).mean()
        va_p = v.rolling(w, min_periods=1).var().clip(lower=1e-6)
        mu_f = self._roll_future_mean(v, w, min_p=1)
        va_f = self._roll_future_var(v, w, min_p=1).clip(lower=1e-6)

        kl_pf = 0.5 * (
            (va_p / va_f) + ((mu_f - mu_p) ** 2) / va_f - 1.0 + np.log(va_f / va_p)
        )
        kl_fp = 0.5 * (
            (va_f / va_p) + ((mu_p - mu_f) ** 2) / va_p - 1.0 + np.log(va_p / va_f)
        )
        return {
            "spd_symkl_1s": (kl_pf + kl_fp).replace([np.inf, -np.inf], np.nan).fillna(0.0)
        }

    def _feat_avoidance_trajectory(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Tính toán quỹ đạo né tránh:
        1. Relative Heading: Góc di chuyển so với hướng tới đối thủ.
        2. Future Distance Gain: Dự báo xem hành động này có giúp chuột ra xa đối thủ trong tương lai không.
        """
        feats = {}
        if target_ctx is None: 
            return feats

        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")
        rel_vec = target_ctx.pos - ctx.pos
        # Góc hướng tới địch (Angle to Target)
        angle_to_target = np.arctan2(rel_vec[:, 1], rel_vec[:, 0])
        
        # Góc di chuyển của Tôi (My Heading)
        my_heading = np.arctan2(ctx.vel[:, 1], ctx.vel[:, 0])
        
        # Độ lệch góc (Absolute Difference)
        # Cần xử lý wrap góc (ví dụ: lệch giữa 179 độ và -179 độ là 2 độ chứ ko phải 358)
        diff = np.abs(angle_to_target - my_heading)
        diff = np.minimum(diff, 2*np.pi - diff) # Chuẩn hóa về [0, pi]
        
        # Feature: Cosine của góc lệch
        # 1.0 (0 độ) -> Lao vào
        # 0.0 (90 độ) -> AVOID (Lách ngang)
        # -1.0 (180 độ) -> Escape
        feats["heading_rel_cos"] = pd.Series(np.cos(diff), index=idx, dtype="float32")
        
        # Feature: Góc lệch tuyệt đối (đổi ra độ cho dễ hình dung nếu cần, ở đây để rad)
        feats["heading_rel_abs"] = pd.Series(diff, index=idx, dtype="float32")


        # --- 2. FUTURE DISTANCE GAIN (Hiệu quả tránh né) ---
        # "Sau 15 frame (0.5s) hoặc 30 frame (1s), mình có xa nó ra không?"
        
        dist_now = np.linalg.norm(rel_vec, axis=1)
        s_dist = pd.Series(dist_now, index=idx)
        
        scales = [15, 30] # 0.5s và 1s
        for w in scales:
            ws = self._scale(w)
            
            # Lấy khoảng cách ở tương lai (shift ngược lên)
            # s.shift(-ws) là giá trị của t + ws
            dist_future = s_dist.shift(-ws)
            gain = dist_future - s_dist
            
            feats[f"dist_gain_{w}f"] = gain.fillna(0.0).astype("float32")

        return feats
    
    def _extract_part(self, ctx: AgentContext, part: str) -> Optional[np.ndarray]:
        if ctx.raw_df is None: return None
        if part not in ctx.raw_df.columns.get_level_values(0): return None
        try:
            sub_df = ctx.raw_df.xs(part, axis=1, level=0)[["x", "y"]].reindex(ctx.idx)
        except KeyError: return None
        raw = sub_df.to_numpy()
        raw = self._forward_fill_nan(raw)
        cm = self._to_cm(raw.astype(np.float32))
        return self._smooth(cm)
    
    def _extract_parts_dict(self, ctx: AgentContext, parts: List[str] = None) -> Dict[str, Optional[np.ndarray]]:
        out = {}
        for p in parts:
            out[p] = self._extract_part(ctx, p)
        return out
        
    def _feat_pose_shape(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Placeholder cho các đặc trưng hình dáng (Elongation, Body Angle...).
        """
        feats = {}

        def zero(): return pd.Series(0.0, index=ctx.idx, dtype="float32")

        def dist(k1, k2):
            p1, p2 = parts.get(k1), parts.get(k2)
            if p1 is None or p2 is None: return zero()
            d = np.linalg.norm(p1 - p2, axis=1)
            return pd.Series(d, index=ctx.idx, dtype="float32")
        
        def body_angle():
            if parts.get("nose") is None: return zero()
            if parts.get("body_center") is None: return zero()
            if parts.get("tail_base") is None: return zero()

            v1 = parts.get("nose") - parts.get("body_center")
            v2 = parts.get("tail_base") - parts.get("body_center")
            dot_product = np.sum(v1 * v2, axis=1)
            mag = np.linalg.norm(v1, axis=1) * np.linalg.norm(v2, axis=1)
            cos_angle = np.clip(dot_product / (mag + 1e-6), -1.0, 1.0).astype("float32")
            return cos_angle
        
        def elongation():
            if parts.get("nose")          is None: return zero()
            if parts.get("tail_base")     is None: return zero()
            if parts.get("lateral_left")  is None: return zero()
            if parts.get("lateral_right") is None: return zero()

            d1 = dist("nose", "tail_base")
            d2 = dist("lateral_left", "lateral_right")
            elongation = d1 / (d2 + 1e-6).astype("float32")
            return elongation

        
        
        def vel(part: str, n_frames_30fps: int) -> Dict:
            part_pos = self._extract_part(ctx, part)
            if part_pos is None: return zero()
            
            s_x = pd.Series(part_pos[:, 0], index=ctx.idx)
            s_y = pd.Series(part_pos[:, 1], index=ctx.idx)
            raw_speed = self._speed_series(s_x, s_y)

            ws = self._scale(n_frames_30fps)
            val = raw_speed.rolling(ws, min_periods=1, center=True).mean()
            return val.astype("float32")


        target_parts = ["nose", "body_center", "tail_base", 
                        "ear_left", "ear_right", 
                        "lateral_left", "lateral_right"]
        
        parts = self._extract_parts_dict(ctx, target_parts)

        # feats["a_body_width"]                = dist("lateral_left", "lateral_right")
        # feats["aa_nose_bodycenter_dist"]     = dist("nose", "body_center")
        # feats["aa_nose_tailbase_dist"]       = dist("nose", "tail_base")
        # feats["aa_bodycenter_tailbase_dist"] = dist("body_center", "tail_base")
        
        # feats["aa_bodycenter_ear_l_dist"]    = dist("body_center", "ear_left")
        # feats["aa_bodycenter_ear_r_dist"]    = dist("body_center", "ear_right")
        # feats["aa_bodycenter_lateral_l_dist"]= dist("body_center", "lateral_left")
        # feats["aa_bodycenter_lateral_r_dist"]= dist("body_center", "lateral_right")
        
        feats["a_body_angle"]                = body_angle()
        # feats["a_elongation"]                = elongation()
        feats["a_tail_base_vel_500ms"]       = vel("tail_base", 15)
        feats["a_tail_base_vel_1000ms"]      = vel("tail_base", 30)
        feats["a_tail_base_vel_2000ms"]      = vel("tail_base", 60)
        feats["a_tail_base_vel_3000ms"]      = vel("tail_base", 90)
        feats["a_nose_vel_500ms"]            = vel("nose", 15)
        feats["a_nose_vel_1000ms"]           = vel("nose", 30)
        feats["a_nose_vel_2000ms"]           = vel("nose", 60)
        feats["a_nose_vel_3000ms"]           = vel("nose", 90)
        feats["a_ear_right_vel_500ms"]       = vel("ear_right", 15)
        feats["a_ear_right_vel_1000ms"]      = vel("ear_right", 30)
        feats["a_ear_right_vel_2000ms"]      = vel("ear_right", 60)
        feats["a_ear_right_vel_3000ms"]      = vel("ear_right", 90)
        # len_1 = dist("tail_base", "tail_midpoint")
        # len_2 = dist("tail_midpoint", "tail_tip")
        # len_full = dist("tail_base", "tail_tip")
        # feats["tail_curl"] = ((len_1 + len_2) / (len_full + 1e-6)).astype("float32")
        return feats

    def _feat_shortburst_social(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Short-burst social features (10–30 frames) đặc biệt cho attack / chase / escape.
        Chỉ dùng được khi có target_ctx.
        """
        feats = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")
    
        # --- Lấy lại vài quantity cơ bản từ pairwise/avoidance ---
        # vector Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        rel_dist = np.linalg.norm(rel_vec, axis=1)
        rel_dist_s = pd.Series(rel_dist, index=idx, dtype="float32")
    
        # unit vector
        rel_dist_safe = np.where(rel_dist == 0, 1e-6, rel_dist)
        u_vec = rel_vec / rel_dist_safe[:, None]
    
        # velocity dọc trục nối (approach speed)
        a_vel = ctx.vel
        t_vel = target_ctx.vel
        a_along = np.sum(a_vel * u_vec, axis=1)                # +: lao vào target
        t_along = np.sum(t_vel * (-u_vec), axis=1)             # +: target lao vào agent
        rel_along = np.sum((a_vel - t_vel) * u_vec, axis=1)    # +: lại gần nhau
    
        a_along_s = pd.Series(a_along, index=idx, dtype="float32")
        t_along_s = pd.Series(t_along, index=idx, dtype="float32")
        rel_along_s = pd.Series(rel_along, index=idx, dtype="float32")
    
        # speed agent / target
        a_speed = ctx.speed_series
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32"
        )
    
        # heading_rel_cos ~ escape / approach
        # vector body của agent
        # (reuse idea từ _feat_pairwise)
        # head ~ nose, tail ~ tail_base/body_center
        parts_a = self._extract_parts_dict(ctx, ["nose", "tail_base", "body_center"])
        head_a = parts_a.get("nose")
        tail_a = parts_a.get("tail_base") if parts_a.get("tail_base") is not None else parts_a.get("body_center")
    
        if head_a is not None and tail_a is not None:
            body_vec_a = head_a - tail_a
            dot = np.sum(body_vec_a * rel_vec, axis=1)
            mag = np.linalg.norm(body_vec_a, axis=1) * rel_dist_safe
            heading_cos = np.clip(dot / (mag + 1e-6), -1.0, 1.0)
            heading_cos_s = pd.Series(heading_cos, index=idx, dtype="float32")
        else:
            heading_cos_s = zero()
    
        # --- Rolling window 10, 20, 30 frames (ở fps gốc) ---
        for w30 in [10, 20]:
            ws = self._scale(w30)
            min_p = max(1, ws // 3)
    
            # Attack-like: approach mạnh, khoảng cách giảm nhanh
            feats[f"sb_att_approach_mean_{w30}"] = a_along_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_att_rel_along_mean_{w30}"] = rel_along_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_att_dist_delta_{w30}"] = (rel_dist_s - rel_dist_s.shift(ws)).fillna(0.0)
    
            # Chase-like: agent & target đều nhanh, dist tương đối nhỏ
            feats[f"sb_chase_speed_agent_mean_{w30}"] = a_speed.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_chase_speed_target_mean_{w30}"] = t_speed.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_chase_dist_mean_{w30}"] = rel_dist_s.rolling(ws, min_periods=min_p).mean()
    
            # Escape-like: heading ngược, dist tăng nhanh
            feats[f"sb_esc_heading_cos_mean_{w30}"] = heading_cos_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_esc_dist_gain_{w30}"] = (rel_dist_s.shift(-ws) - rel_dist_s).fillna(0.0)
    
        # clip & fillna
        for k, v in feats.items():
            feats[k] = v.replace([np.inf, -np.inf], np.nan).fillna(0.0).astype("float32")
    
        return feats


    def _feat_follow_pattern(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Đặc trưng hành vi FOLLOW:
          - Agent ở gần target
          - Cùng hướng (body + velocity)
          - Tốc độ vừa phải
          - Khoảng cách tương đối ổn định trong 0.5–1s
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")
    
        # --- 1. CÁC ĐẠI LƯỢNG CƠ BẢN ---
        # Vector Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        rel_dist = np.linalg.norm(rel_vec, axis=1)
        rel_dist_s = pd.Series(rel_dist, index=idx, dtype="float32")
    
        # Speed agent/target
        a_speed = ctx.speed_series.astype("float32")
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32",
        )
    
        # Body vector: nose - tail/body_center
        parts_a = self._extract_parts_dict(ctx, ["nose", "tail_base", "body_center"])
        parts_t = self._extract_parts_dict(target_ctx, ["nose", "tail_base", "body_center"])
    
        def body_vec(parts_dict):
            head = parts_dict.get("nose")
            tail = parts_dict.get("tail_base")
            if tail is None:
                tail = parts_dict.get("body_center")
            if head is None or tail is None:
                return None
            return head - tail
    
        a_body = body_vec(parts_a)
        t_body = body_vec(parts_t)
    
        if a_body is not None and t_body is not None:
            dot_bt = np.sum(a_body * t_body, axis=1)
            mag_bt = np.linalg.norm(a_body, axis=1) * np.linalg.norm(t_body, axis=1)
            cos_body = np.clip(dot_bt / (mag_bt + 1e-6), -1.0, 1.0)
            cos_body_s = pd.Series(cos_body, index=idx, dtype="float32")
        else:
            cos_body_s = zero()
    
        # Velocity hướng
        a_vel = ctx.vel
        t_vel = target_ctx.vel
        a_speed_np = np.linalg.norm(a_vel, axis=1)
        t_speed_np = np.linalg.norm(t_vel, axis=1)
        moving_mask = (a_speed_np > 1e-3) & (t_speed_np > 1e-3)
    
        # cos giữa hướng velocity 2 con
        dot_v = np.sum(a_vel * t_vel, axis=1)
        mag_v = a_speed_np * t_speed_np + 1e-6
        cos_vel = np.zeros_like(dot_v, dtype="float32")
        cos_vel[moving_mask] = np.clip(dot_v[moving_mask] / mag_v[moving_mask], -1.0, 1.0)
        cos_vel_s = pd.Series(cos_vel, index=idx, dtype="float32")
    
        # --- 2. WINDOW NGẮN (FOLLOW LÀ PATTERN DÀI HƠN ATTACK) ---
        for w30 in [15, 30, 60]:   # ~0.5s, 1s, 2s
            ws = self._scale(w30)
            min_p = max(ws // 3, 1)
    
            # Khoảng cách trung bình & độ dao động
            m_dist = rel_dist_s.rolling(ws, min_periods=min_p).mean()
            s_dist = rel_dist_s.rolling(ws, min_periods=min_p).std()
    
            # Cùng hướng (body + velocity)
            m_cos_body = cos_body_s.rolling(ws, min_periods=min_p).mean()
            m_cos_vel  = cos_vel_s.rolling(ws, min_periods=min_p).mean()
    
            # Tốc độ vừa phải
            m_sp_a = a_speed.rolling(ws, min_periods=min_p).mean()
            m_sp_t = t_speed.rolling(ws, min_periods=min_p).mean()
    
            feats[f"follow_dist_mean_{w30}"] = m_dist
            feats[f"follow_dist_std_{w30}"]  = s_dist
            feats[f"follow_cos_body_mean_{w30}"] = m_cos_body
            feats[f"follow_cos_vel_mean_{w30}"]  = m_cos_vel
            feats[f"follow_speed_agent_mean_{w30}"] = m_sp_a
            feats[f"follow_speed_target_mean_{w30}"] = m_sp_t
    
        # Clean
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )
    
        return feats
        

    def _feat_pairwise(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict:
        """
        Đặc trưng tương tác cặp đôi (Pairwise): Khoảng cách, Tốc độ tiếp cận.
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None: 
            return feats

        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")

        # --- 1. KHOẢNG CÁCH CƠ BẢN (DISTANCES) ---
        # Vector nối Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        dist = np.linalg.norm(rel_vec, axis=1)
        feats["rel_dist"] = pd.Series(dist, index=idx, dtype="float32")

        # --- 2. KHOẢNG CÁCH CHI TIẾT (NOSE-TO-PART) ---
        # Lấy các bộ phận quan trọng
        my_parts = self._extract_parts_dict(ctx, ["nose", "neck"])
        target_parts = self._extract_parts_dict(target_ctx, 
            ["nose", "tail_base", "body_center", "ear_left", "ear_right", 
             "lateral_left", "lateral_right"])

        def dist_ab(pt_a, pt_b):
            if pt_a is None or pt_b is None: return zero()
            d = np.linalg.norm(pt_a - pt_b, axis=1)
            return pd.Series(d, index=idx, dtype="float32")

        an, tn = my_parts["nose"], target_parts["nose"]
        feats["dist_nose_nose"] = dist_ab(an, tn)
        feats["dist_nose_tail"] = dist_ab(an, target_parts["tail_base"])
        feats["dist_nose_body"] = dist_ab(an, target_parts["body_center"])
        feats["dist_nose_el"]   = dist_ab(an, target_parts["ear_left"])
        feats["dist_nose_er"]   = dist_ab(an, target_parts["ear_right"])
        feats["dist_nose_tll"]  = dist_ab(an, target_parts["lateral_left"])
        feats["dist_nose_tlr"]  = dist_ab(an, target_parts["lateral_right"])
        # feats["dist_nose_tt"]  = dist_ab(an, target_parts["tail_tip"])

        # --- 3. ĐỊNH HƯỚNG & GÓC NHÌN (ORIENTATION & GAZE) ---
        # Helper lấy vector cơ thể (Mũi - Đuôi/Thân)
        def get_body_vec(parts_dict):
            head = parts_dict.get("nose")
            # Ưu tiên đuôi, nếu ko có thì dùng thân
            tail = parts_dict.get("tail_base")
            if tail is None: tail = parts_dict.get("body_center") # Fallback
            
            if head is not None and tail is not None:
                return head - tail
            return None

        a_vec = get_body_vec(my_parts)
        t_vec = get_body_vec(target_parts)

        # A. Body Cosine: Hai con cùng chiều hay ngược chiều?
        if a_vec is not None and t_vec is not None:
            dot = np.sum(a_vec * t_vec, axis=1)
            mags = np.linalg.norm(a_vec, axis=1) * np.linalg.norm(t_vec, axis=1)
            feats["body_cosine"] = pd.Series(
                np.clip(dot / (mags + 1e-6), -1.0, 1.0), index=idx, dtype="float32"
            )
        else:
            feats["body_cosine"] = zero()

        # B. Gaze Cosine: Tôi có đang nhìn về phía Target không?
        # Vector ánh nhìn = Target_Pos - My_Pos = rel_vec
        if a_vec is not None:
            dot_gaze = np.sum(a_vec * rel_vec, axis=1)
            mag_a = np.linalg.norm(a_vec, axis=1)
            # dist đã tính ở bước 1
            feats["gaze_cosine"] = pd.Series(
                np.clip(dot_gaze / (mag_a * dist + 1e-6), -1.0, 1.0),
                index=idx, dtype="float32"
            )
        else:
            feats["gaze_cosine"] = zero()

        # --- 4. PHÂN RÃ VẬN TỐC (VELOCITY DECOMPOSITION) - CHÌA KHÓA CHO AVOID/ESCAPE ---
        # Vector đơn vị hướng về địch (u)
        dist_safe = dist.copy()
        dist_safe[dist_safe == 0] = 1e-6
        u_vec = rel_vec / dist_safe[:, None]

        # a_vel và t_vel lấy từ Context
        a_vel, t_vel = ctx.vel, target_ctx.vel

        # A. Approach Speed (Vận tốc dọc trục nối 2 con)
        # Dương: Lao vào nhau | Âm: Chạy ra xa nhau
        a_along = np.sum(a_vel * u_vec, axis=1)
        t_along = np.sum(t_vel * (-u_vec), axis=1) # Target hướng ngược lại
        rel_along = np.sum((a_vel - t_vel) * u_vec, axis=1)

        # B. Lateral Speed (Vận tốc ngang - Vuông góc trục nối)
        # Vector chiếu: v_proj = (v . u) * u
        a_proj = a_along[:, None] * u_vec
        a_lat_vec = a_vel - a_proj
        a_lat_speed = np.linalg.norm(a_lat_vec, axis=1)

        feats["approach_speed_agent"]  = pd.Series(a_along, index=idx, dtype="float32")
        feats["approach_speed_target"] = pd.Series(t_along, index=idx, dtype="float32")
        feats["approach_speed_rel"]    = pd.Series(rel_along, index=idx, dtype="float32")
        feats["lateral_speed_agent"]   = pd.Series(a_lat_speed, index=idx, dtype="float32")
        return feats


    # --- Compatibility methods ---
    
    def build_pose_tensor(self, tracking: pd.DataFrame):
        """
        Chuyển dữ liệu tracking (DataFrame) sang Tensor [Frames, Mice, 2] và Dict chi tiết.
        """
        tracking = tracking.sort_values("video_frame")
        frames = np.sort(tracking["video_frame"].unique())
        
        pvid = tracking.pivot(
            index="video_frame", 
            columns=["mouse_id", "bodypart"], 
            values=["x", "y"]
        )
        pvid = pvid.reorder_levels([1, 2, 0], axis=1).sort_index(axis=1).astype("float32")
        mouse_ids = list(pvid.columns.get_level_values(0).unique())
        pos = np.full((len(frames), len(mouse_ids), 2), np.nan, dtype=np.float32)
        per_mouse_df = {}
        
        for i, mid in enumerate(mouse_ids):
            single = pvid[mid]
            per_mouse_df[mid] = single
            
            if "body_center" in single.columns.get_level_values(0):
                cx = single["body_center"]["x"]
                cy = single["body_center"]["y"]
            else:
                cx = single.xs("x", level=1, axis=1).mean(axis=1)
                cy = single.xs("y", level=1, axis=1).mean(axis=1)
            
            pos[:, i, 0] = cx.reindex(frames).values
            pos[:, i, 1] = cy.reindex(frames).values
            
        return frames, mouse_ids, pos, per_mouse_df

    def extract_agent_target(
        self, 
        frames: np.ndarray, 
        mouse_ids: List[Any], 
        pos: np.ndarray, 
        agent_id: Any, 
        target_id: Any, 
        per_mouse_df: Dict = None
    ) -> pd.DataFrame:
        """
        Compute all registered features for one (agent, target) pair.

        Returns a DataFrame indexed by frame with columns sorted by name and
        NaN / inf replaced by 0. An empty DataFrame is returned when agent_id is
        not in mouse_ids. A target context is only built when pairwise features
        are enabled and target_id is a known mouse.
        """
        try:
            agent_slot = mouse_ids.index(agent_id)
        except ValueError:
            return pd.DataFrame()

        def raw_parts(mouse_id):
            return per_mouse_df.get(mouse_id) if per_mouse_df else None

        # 1. Agent context
        ctx_agent = self._build_context(frames, pos[:, agent_slot, :], raw_parts(agent_id))

        # 2. Target context (optional)
        ctx_target = None
        wants_target = self.cfg.use_pairwise and target_id is not None and target_id in mouse_ids
        if wants_target:
            target_slot = mouse_ids.index(target_id)
            ctx_target = self._build_context(frames, pos[:, target_slot, :], raw_parts(target_id))

        # 3. Run every registered feature module and merge the results
        merged = {}
        for feature_fn in self.feature_registry.values():
            merged.update(feature_fn(ctx_agent, target_ctx=ctx_target))

        table = pd.DataFrame(merged, index=ctx_agent.idx)
        table = table.replace([np.inf, -np.inf], np.nan).fillna(0.0)

        return table.reindex(sorted(table.columns), axis=1)

#================================================================================
#================================================================================
#================================================================================


from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import gc
import itertools
import json
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedGroupKFold

# (Trên Kaggle) dùng metric chính thức
import sys
sys.path.append("/kaggle/usr/lib/mabe-f-beta")
from metric import score   # hàm score(submission_df, dataset_df)

# =========================================================
# 1. ĐƯỜNG DẪN & CẤU HÌNH
# =========================================================

# Root of the competition dataset and its tracking / annotation sub-folders.
INPUT_DIR = Path("/kaggle/input/MABe-mouse-behavior-detection")
TRAIN_TRACKING_DIR = INPUT_DIR / "train_tracking"
TRAIN_ANNOTATION_DIR = INPUT_DIR / "train_annotation"
TEST_TRACKING_DIR = INPUT_DIR / "test_tracking"


WORKING_DIR = Path("/kaggle/working")
# Per-(lab, behavior) fold models, thresholds and OOF predictions are read from
# and written to this directory.
# NOTE(review): the path sits under /kaggle/input, which is presumably read-only;
# the mkdir below would then only succeed if the directory already exists — confirm.
RESULTS_DIR = Path(r"/kaggle/input/results-xgb-fe")
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

# Columns that identify one frame-level sample.
INDEX_COLS = ["video_id", "agent_id", "target_id", "video_frame"]

# "self" vs "pair" behaviors, same split as the reference notebook (adjust if needed).
SELF_BEHAVIORS = [
    "biteobject", "climb", "dig", "exploreobject", "freeze",
    "genitalgroom", "huddle", "rear", "rest", "run", "selfgroom",
]
PAIR_BEHAVIORS = [
    "allogroom", "approach", "attack", "attemptmount", "avoid",
    "chase", "chaseattack", "defend", "disengage", "dominance",
    "dominancegroom", "dominancemount", "ejaculate", "escape",
    "flinch", "follow", "intromit", "mount", "reciprocalsniff",
    "shepherd", "sniff", "sniffbody", "sniffface", "sniffgenital",
    "submit", "tussle",
]


# =========================================================
# 2. ĐỌC METADATA & HELPER
# =========================================================

def load_metadata() -> pd.DataFrame:
    """Read the training metadata table ``train.csv`` from ``INPUT_DIR``."""
    return pd.read_csv(INPUT_DIR / "train.csv")


def get_video_params(video_id: Any, meta: pd.DataFrame) -> Tuple[float, float]:
    """Return ``(fps, pix_per_cm)`` for one video from a metadata table.

    Args:
        video_id: value looked up in the ``video_id`` column of ``meta``.
        meta: metadata table with the columns ``video_id``,
            ``frames_per_second`` and ``pix_per_cm_approx``.

    Returns:
        ``(fps, pix_per_cm)`` taken from the first matching row. A missing,
        non-finite or non-positive value falls back to ``30.0`` for fps and
        ``1.0`` for pix_per_cm.

    Raises:
        KeyError: if no row of ``meta`` matches ``video_id``.
    """
    matches = meta.loc[meta["video_id"] == video_id]
    if matches.empty:
        raise KeyError(f"video_id={video_id} không có trong train.csv")
    record = matches.iloc[0]

    # Columns as named in the metadata CSV: "frames_per_second" and
    # "pix_per_cm_approx".
    fps = float(record["frames_per_second"])
    if not np.isfinite(fps) or fps <= 0:
        # A NaN/zero frame rate cannot be used to rescale window lengths;
        # fall back to the 30 fps reference rate.
        fps = 30.0

    pix_per_cm = float(record["pix_per_cm_approx"])
    if not np.isfinite(pix_per_cm) or pix_per_cm <= 0:
        # Without a usable scale, keep coordinates in pixel units.
        pix_per_cm = 1.0
    return fps, pix_per_cm


def load_tracking(lab_id: str, video_id: Any) -> pd.DataFrame:
    """Load the training tracking parquet of one video as a DataFrame.

    The file is ``TRAIN_TRACKING_DIR/<lab_id>/<video_id>.parquet``. Per the
    original author's note the expected schema is
    (video_frame, mouse_id, bodypart, x, y).

    Raises:
        FileNotFoundError: if the parquet file does not exist.
    """
    parquet_file = TRAIN_TRACKING_DIR / str(lab_id) / f"{video_id}.parquet"
    if parquet_file.exists():
        return pd.read_parquet(parquet_file)
    raise FileNotFoundError(parquet_file)

def load_tracking_test(lab_id: str, video_id: Any) -> pd.DataFrame:
    """Load the test tracking parquet of one video as a DataFrame.

    The file is ``INPUT_DIR/test_tracking/<lab_id>/<video_id>.parquet``.

    Raises:
        FileNotFoundError: if the parquet file does not exist.
    """
    parquet_file = INPUT_DIR / "test_tracking" / str(lab_id) / f"{video_id}.parquet"
    if parquet_file.exists():
        return pd.read_parquet(parquet_file)
    raise FileNotFoundError(parquet_file)


def load_annotation(lab_id: str, video_id: Any) -> pd.DataFrame:
    """Load the annotation table of one training video.

    Returns the columns (agent_id, target_id, action, start_frame, stop_frame)
    of ``TRAIN_ANNOTATION_DIR/<lab_id>/<video_id>.parquet``. When the file
    does not exist (the video has no labels) an empty frame with those
    columns is returned instead.
    """
    wanted = ["agent_id", "target_id", "action", "start_frame", "stop_frame"]
    parquet_file = TRAIN_ANNOTATION_DIR / str(lab_id) / f"{video_id}.parquet"
    if parquet_file.exists():
        return pd.read_parquet(parquet_file)[wanted]
    # No labels for this video.
    return pd.DataFrame(columns=wanted)


# =========================================================
# 3. TÍNH FEATURE PER-FRAME BẰNG FEATUREEXTRACTOR
# =========================================================

# In-memory cache of per-frame features:
#   (lab, video, agent, target) -> (frames, feature_df)
# Entries are added by the get_frame_features_for_pair* functions and are
# never evicted by the code in this module.
_feature_cache: Dict[Tuple[str, int, int, int], Tuple[np.ndarray, pd.DataFrame]] = {}


def get_frame_features_for_pair(
    lab_id: str,
    video_id: int,
    agent_id: int,
    target_id: int,
    meta: pd.DataFrame,
) -> Tuple[np.ndarray, pd.DataFrame]:
    """
    Compute (or fetch from the cache) per-frame features for one training
    video and one (agent, target) pair.

    Args:
        lab_id: lab folder under the training tracking directory.
        video_id: video to load; also used to look up fps / pix_per_cm.
        agent_id: mouse whose features are computed.
        target_id: partner mouse; may equal ``agent_id`` ("self" behaviors).
        meta: metadata table passed to ``get_video_params``.

    Returns:
        ``(frames, features_df)``. ``features_df`` is indexed by ``frames``;
        it is an empty DataFrame when the extractor produced no rows (the
        agent is not tracked in this video).
    """
    key = (str(lab_id), int(video_id), int(agent_id), int(target_id))
    cached = _feature_cache.get(key)
    if cached is not None:
        return cached

    fps, pix_per_cm = get_video_params(video_id, meta)
    tracking = load_tracking(lab_id, video_id)

    fe = FeatureExtractor(
        fps=fps,
        pix_per_cm=pix_per_cm,
        smooth_sigma=1.0,
        use_pairwise=True,
    )

    frames, mouse_ids, pos, per_mouse_df = fe.build_pose_tensor(tracking)

    # agent and target may be the same mouse (self) or two different mice (pair)
    features_df: pd.DataFrame = fe.extract_agent_target(
        frames=frames,
        mouse_ids=mouse_ids,
        pos=pos,
        agent_id=agent_id,
        target_id=target_id,
        per_mouse_df=per_mouse_df,
    )
    # Index the features by frame. extract_agent_target returns an empty
    # frame for an unknown agent; assigning a length-F index to it would
    # raise a length-mismatch ValueError, so only relabel aligned tables.
    if len(features_df) == len(frames):
        features_df.index = frames

    _feature_cache[key] = (frames, features_df)
    return frames, features_df

# NOTE: this re-binds the module-level cache to a fresh dict (with a looser
# key annotation, since test-time mouse ids may not be ints). Train and test
# entries share the dict; test keys carry a "test_" lab prefix.
_feature_cache: Dict[Tuple[str, int, Any, Any], Tuple[np.ndarray, pd.DataFrame]] = {}

def get_frame_features_for_pair_test(
    lab_id: str,
    video_id: int,
    agent_id: Any,
    target_id: Any,
    test_meta: pd.DataFrame,
) -> Tuple[np.ndarray, pd.DataFrame]:
    """
    Compute (or fetch from the cache) per-frame features for one test video
    and one (agent, target) pair.

    Args:
        lab_id: lab folder under the test tracking directory.
        video_id: video to load; its row in ``test_meta`` supplies
            ``frames_per_second`` and ``pix_per_cm_approx``.
        agent_id: mouse whose features are computed.
        target_id: partner mouse; may equal ``agent_id``.
        test_meta: test metadata table.

    Returns:
        ``(frames, features_df)``. ``features_df`` is indexed by ``frames``;
        it is an empty DataFrame when the extractor produced no rows, which
        callers detect through ``features_df.empty``.

    Raises:
        IndexError: if ``video_id`` has no row in ``test_meta``.
    """
    key = (f"test_{lab_id}", int(video_id), agent_id, target_id)
    cached = _feature_cache.get(key)
    if cached is not None:
        return cached

    # fps and pix_per_cm_approx come from the test metadata
    row = test_meta[test_meta["video_id"] == video_id].iloc[0]
    fps = float(row["frames_per_second"])
    if not np.isfinite(fps) or fps <= 0:
        # unusable frame rate: fall back to the 30 fps reference rate
        fps = 30.0
    pix_per_cm = float(row["pix_per_cm_approx"])
    if not np.isfinite(pix_per_cm) or pix_per_cm <= 0:
        pix_per_cm = 1.0

    tracking = load_tracking_test(lab_id, video_id)

    fe = FeatureExtractor(
        fps=fps,
        pix_per_cm=pix_per_cm,
        smooth_sigma=1.0,
        use_pairwise=True,
    )

    frames, mouse_ids, pos, per_mouse_df = fe.build_pose_tensor(tracking)

    features_df = fe.extract_agent_target(
        frames=frames,
        mouse_ids=mouse_ids,
        pos=pos,
        agent_id=agent_id,
        target_id=target_id,
        per_mouse_df=per_mouse_df,
    )
    # extract_agent_target returns an empty frame for an unknown agent;
    # assigning a length-F index to it would raise a length-mismatch
    # ValueError, so only relabel tables that align with `frames`.
    if len(features_df) == len(frames):
        features_df.index = frames

    _feature_cache[key] = (frames, features_df)
    return frames, features_df



# =========================================================
# 4. BUILD FRAME-LEVEL DATASET CHO 1 (lab_id, behavior)
# =========================================================

def build_frame_dataset_for_lab_behavior(
    lab_id: str,
    behavior: str,
    train_meta: pd.DataFrame,
    mode: str = "self",
) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray]:
    """
    Build the frame-level dataset (indices, features, labels) for one
    (lab, behavior).

    For every video of the lab that has annotations of ``behavior`` and for
    every annotated (agent, target) pair, all frames of the video become
    samples; a frame is positive when it lies in any annotated
    ``[start_frame, stop_frame)`` interval of that pair.

    Args:
        lab_id: lab whose videos are scanned (matched against
            ``train_meta["lab_id"]``).
        behavior: action name to build labels for.
        train_meta: training metadata table.
        mode: ``"self"`` computes features with the agent as its own target;
            any other value uses the annotated target.

    Returns:
        ``(indices, features, labels)``: ``indices`` has the columns
        ``INDEX_COLS``, ``features`` holds the per-frame features and
        ``labels`` is a binary int8 array. All three are empty when no pair
        contributes a positive frame.
    """

    videos = (
        train_meta.loc[train_meta["lab_id"] == lab_id, "video_id"]
        .unique()
        .tolist()
    )

    index_list = []
    feature_list = []
    label_list = []

    for video_id in videos:
        ann = load_annotation(lab_id, video_id)
        if ann.empty:
            continue

        # keep only the annotations of this behavior
        ann_bhv = ann[ann["action"] == behavior]
        if ann_bhv.empty:
            continue

        # the (agent, target) pairs to process
        pairs = ann_bhv[["agent_id", "target_id"]].drop_duplicates().values.tolist()

        for (agent_id, target_id) in pairs:
            target_id_use = agent_id if mode == "self" else target_id

            frames, feat_df = get_frame_features_for_pair(
                lab_id=lab_id,
                video_id=video_id,
                agent_id=agent_id,
                target_id=target_id_use,
                meta=train_meta,
            )

            # Skip pairs without a usable feature table (e.g. the agent is
            # not tracked in this video); appending them would desynchronise
            # indices / features / labels.
            if len(feat_df) != len(frames):
                continue

            # per-frame label: frame in any [start, stop) of (agent, target, behavior)
            ann_pair = ann_bhv[
                (ann_bhv["agent_id"] == agent_id)
                & (ann_bhv["target_id"] == target_id)
            ]
            if ann_pair.empty and mode == "self":
                ann_pair = ann_bhv[ann_bhv["agent_id"] == agent_id]

            # Mark positives interval by interval instead of materialising
            # every positive frame number in a Python set.
            frame_arr = np.asarray(frames)
            positive = np.zeros(len(frame_arr), dtype=bool)
            for start, stop in zip(ann_pair["start_frame"], ann_pair["stop_frame"]):
                positive |= (frame_arr >= int(start)) & (frame_arr < int(stop))

            if not positive.any():
                continue
            label = positive.astype("int8")

            idx_df = pd.DataFrame(
                {
                    "video_id": video_id,
                    "agent_id": agent_id,
                    "target_id": target_id,
                    "video_frame": frames,
                }
            )

            index_list.append(idx_df)
            feature_list.append(feat_df.reset_index(drop=True))
            label_list.append(label)

    if not index_list:
        return (
            pd.DataFrame(columns=INDEX_COLS),
            pd.DataFrame(),
            np.zeros(0, dtype="int8"),
        )

    indices = pd.concat(index_list, ignore_index=True)
    features = pd.concat(feature_list, ignore_index=True)
    labels = np.concatenate(label_list).astype("int8")

    # internal invariant: the three outputs are row-aligned
    assert len(indices) == len(features) == len(labels)

    return indices, features, labels


# =========================================================
# 5. TRAIN + OOF CHO 1 (lab_id, behavior)
# =========================================================

def tune_threshold(oof_pred: np.ndarray, y: np.ndarray) -> float:
    """Return the probability cut-off that maximises F1 against ``y``.

    Candidates run over ``np.arange(0.0, 1.005, 0.005)``; a prediction is
    positive when ``oof_pred >= threshold``. On ties the smallest candidate
    wins.
    """
    best_th = None
    best_f1 = -np.inf
    for th in np.arange(0.0, 1.005, 0.005):
        current = f1_score(y, (oof_pred >= th), zero_division=0)
        # strict '>' keeps the first (lowest) threshold among equal scores
        if current > best_f1:
            best_th, best_f1 = th, current
    return float(best_th)

#
def train_validate_one(
    lab_id: str,
    behavior: str,
    indices: pd.DataFrame,
    features: pd.DataFrame,
    labels: np.ndarray,
) -> float:
    """
    Train a binary XGBoost model per CV fold for one (lab, behavior) and
    store the out-of-fold (OOF) predictions.

    Files written under ``RESULTS_DIR/<lab_id>/<behavior>``:
      - ``fold_<k>/model.json`` and ``fold_<k>/threshold.txt`` per fold,
      - ``oof_predictions.parquet`` (``indices`` plus fold, prediction,
        predicted_label),
      - ``f1.txt``.

    Args:
        lab_id: lab name, used as a directory component.
        behavior: behavior name, used as a directory component.
        indices: per-frame index table; its ``video_id`` column is the CV group.
        features: per-frame feature table, row-aligned with ``indices``.
        labels: binary labels, row-aligned with ``indices``.

    Returns:
        Frame-level F1 over all OOF predictions. Each fold's threshold is
        tuned on that same fold's validation predictions, so the value is an
        optimistic estimate. Returns 0.0 when there are no samples or no
        positives.
    """
    result_dir = RESULTS_DIR / lab_id / behavior
    result_dir.mkdir(parents=True, exist_ok=True)

    n = len(labels)

    # Nothing to learn: write an all-zero OOF table and an F1 of 0.
    if n == 0 or labels.sum() == 0:
        oof_df = indices.copy()
        oof_df["fold"] = -1
        oof_df["prediction"] = 0.0
        oof_df["predicted_label"] = 0
        oof_df.to_parquet(result_dir / "oof_predictions.parquet", index=False)
        (result_dir / "f1.txt").write_text("0.0\n")
        return 0.0

    X = features.values.astype("float32")
    y = labels.astype("int8")
    # Group by video so frames of one video never straddle train/validation.
    groups = indices["video_id"].values

    # -1 marks rows that were never assigned to a validation fold.
    folds = np.ones(n, dtype="int8") * -1
    oof_pred = np.zeros(n, dtype="float32")
    oof_label = np.zeros(n, dtype="int8")

    cv = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=42)

    for fold, (tr_idx, va_idx) in enumerate(cv.split(X, y, groups=groups)):
        fold_dir = result_dir / f"fold_{fold}"
        fold_dir.mkdir(parents=True, exist_ok=True)

        X_tr, y_tr = X[tr_idx], y[tr_idx]
        X_va, y_va = X[va_idx], y[va_idx]

        # scale_pos_weight = negatives / positives of the training split
        # (1.0 when the split has no positives)
        pos = y_tr.sum()
        neg = len(y_tr) - pos
        scale_pos_weight = float(neg / pos) if pos > 0 else 1.0

        params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "device": "cuda",
            "tree_method": "hist",
            "learning_rate": 0.05,
            "max_depth": 6,
            "min_child_weight": 5,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "scale_pos_weight": scale_pos_weight,
            "max_bin": 64,
            "seed": 42,
        }

        # max_bin here matches params["max_bin"] above.
        dtrain = xgb.QuantileDMatrix(
            X_tr,
            label=y_tr,
            feature_names=features.columns.tolist(),
            max_bin=64,
        )
        dvalid = xgb.DMatrix(
            X_va,
            label=y_va,
            feature_names=features.columns.tolist(),
        )

        evals_result: Dict[str, Dict[str, List[float]]] = {}

        # Stop once the validation logloss has not improved for 10 rounds.
        early_stop = xgb.callback.EarlyStopping(
            rounds=10, metric_name="logloss", data_name="valid", maximize=False
        )

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=250,
            evals=[(dtrain, "train"), (dvalid, "valid")],
            callbacks=[early_stop],
            evals_result=evals_result,
            verbose_eval=False,
        )

        pred_va = model.predict(dvalid)
        # The threshold is tuned on the very fold it is then applied to.
        th = tune_threshold(pred_va, y_va)

        folds[va_idx] = fold
        oof_pred[va_idx] = pred_va
        oof_label[va_idx] = (pred_va >= th).astype("int8")

        model.save_model(fold_dir / "model.json")
        with open(fold_dir / "threshold.txt", "w") as f:
            f.write(f"{th}\n")

    # persist the OOF table
    oof_df = indices.copy()
    oof_df["fold"] = folds
    oof_df["prediction"] = oof_pred
    oof_df["predicted_label"] = oof_label
    oof_df.to_parquet(result_dir / "oof_predictions.parquet", index=False)

    f1 = f1_score(y, oof_label, zero_division=0)
    (result_dir / "f1.txt").write_text(f"{f1:.6f}\n")
    return float(f1)

def load_models_for_behavior_infer(lab_id: str, behavior: str):
    """
    Load every fold model and its threshold for one (lab, behavior) from
    ``RESULTS_DIR``; used at inference time.

    Returns a list of ``(booster, threshold)`` tuples in sorted ``fold_*``
    directory order. Folds without ``model.json`` are skipped; a missing
    ``threshold.txt`` defaults the threshold to 0.5. The list is empty when
    the (lab, behavior) directory does not exist.
    """
    loaded = []
    behavior_dir = RESULTS_DIR / lab_id / behavior
    if not behavior_dir.exists():
        return loaded

    for fold_path in sorted(behavior_dir.glob("fold_*")):
        weights = fold_path / "model.json"
        if weights.exists():
            model = xgb.Booster()
            model.load_model(str(weights))

            threshold_path = fold_path / "threshold.txt"
            threshold = (
                float(threshold_path.read_text().strip())
                if threshold_path.exists()
                else 0.5
            )
            loaded.append((model, threshold))

    return loaded


# =========================================================
# 6. LOOP QUA TẤT CẢ BEHAVIORS TRONG 1 LAB
#    (train_all_labs_behaviors vẫn giữ nguyên, nhưng main
#     sẽ filter train_meta chỉ còn 1 lab)
# =========================================================

def train_all_labs_behaviors(train_meta: pd.DataFrame):
    """
    Train one frame-level model per (lab, action) for every lab found in
    ``train_meta``.

    For each lab: read the annotations of all its videos, collect the
    actions that actually occur, then build the dataset and train/validate a
    model for each action in sorted order. Progress and the OOF F1 are
    printed as training proceeds.
    """
    started = time.perf_counter()

    for lab_id in train_meta["lab_id"].unique().tolist():
        # videos that belong to this lab
        lab_rows = train_meta[train_meta["lab_id"] == lab_id]
        lab_videos = lab_rows["video_id"].unique().tolist()

        # every action that really appears in this lab's annotations
        annotations = (load_annotation(lab_id, vid) for vid in lab_videos)
        behaviors = sorted(
            {
                action
                for ann in annotations
                if not ann.empty
                for action in ann["action"].unique().tolist()
            }
        )
        print(f"\n===== LAB {lab_id}: {len(behaviors)} behaviors =====")

        for behavior in behaviors:
            if behavior in SELF_BEHAVIORS:
                mode = "self"
            else:
                mode = "pair"

            print(f"\n=== LAB={lab_id} | behavior={behavior} | mode={mode} ===")
            indices, features, labels = build_frame_dataset_for_lab_behavior(
                lab_id=str(lab_id),
                behavior=behavior,
                train_meta=train_meta,
                mode=mode,
            )
            print(
                f"frames: {len(labels):,}, positives: {labels.sum():,}, features: "
                f"{features.shape[1] if not features.empty else 0}"
            )

            if len(labels) == 0:
                print(" -> skip (no samples)")
                continue

            f1 = train_validate_one(str(lab_id), behavior, indices, features, labels)
            elapsed = time.perf_counter() - started
            print(f" -> OOF F1 (frame-level): {f1:.3f} | elapsed={elapsed/60:.1f} min")



# =========================================================
# 7. GOM OOF PREDICTION → SEGMENT & TÍNH SCORE()
# =========================================================

def build_oof_submission_from_parquet(
    target_lab_id: Optional[str] = None,
) -> pd.DataFrame:
    """
    Turn the stored OOF predictions into segment-level predictions.

    Every ``oof_predictions.parquet`` under ``RESULTS_DIR`` is read (lab and
    behavior are taken from the two parent directory names), the frame-level
    tables are concatenated, and each run of consecutive rows with
    ``prediction >= 0.5`` inside one (lab, video, agent, target, action)
    group becomes a ``[start_frame, stop_frame)`` segment.

    Args:
        target_lab_id: when given, only OOF files of that lab are used
            (e.g. "AdaptableSnail").

    Returns:
        DataFrame with the columns lab_id, video_id, agent_id, target_id,
        action, start_frame, stop_frame, sorted by those keys; empty (same
        columns) when no frame passes the threshold.

    Raises:
        RuntimeError: if no OOF file exists, or none belongs to
            ``target_lab_id``.
    """
    segment_cols = [
        "lab_id",
        "video_id",
        "agent_id",
        "target_id",
        "action",
        "start_frame",
        "stop_frame",
    ]
    group_keys = ["lab_id", "video_id", "agent_id", "target_id", "action"]

    oof_files = list(RESULTS_DIR.glob("*/**/oof_predictions.parquet"))
    if not oof_files:
        raise RuntimeError("Không tìm thấy OOF parquet, hãy train trước.")

    tables = []
    for oof_path in oof_files:
        # layout: <results>/<lab>/<behavior>/oof_predictions.parquet
        lab_id, behavior = oof_path.parts[-3], oof_path.parts[-2]

        # keep only the requested lab, if one was requested
        if target_lab_id is not None and lab_id != target_lab_id:
            continue

        table = pd.read_parquet(oof_path)[INDEX_COLS + ["prediction"]].copy()
        table["lab_id"] = lab_id
        table["action"] = behavior
        tables.append(table)

    if not tables:
        raise RuntimeError(
            f"Không có OOF predictions nào cho lab_id={target_lab_id}"
        )

    frame_df = (
        pd.concat(tables, ignore_index=True)
        .sort_values(group_keys + ["video_frame"])
        .reset_index(drop=True)
    )

    def make_record(keys, start, stop):
        # One segment row; ids are converted to int only when a segment exists.
        lab_id, video_id, agent_id, target_id, action = keys
        return {
            "lab_id": lab_id,
            "video_id": int(video_id),
            "agent_id": int(agent_id),
            "target_id": int(target_id),
            "action": action,
            "start_frame": start,
            "stop_frame": stop,
        }

    # frame-level probability -> hard label -> segments
    segments = []
    for keys, group in frame_df.groupby(group_keys, sort=False):
        frames = group["video_frame"].values
        # fixed demo threshold of 0.5 (per-(lab, behavior) thresholds could
        # be stored and applied here instead)
        hard = group["prediction"].values >= 0.5

        run_start = None   # first frame of the currently open run, if any
        last_frame = None  # frame seen in the previous iteration
        for frame, is_positive in zip(frames, hard):
            if is_positive:
                if run_start is None:
                    run_start = int(frame)
            elif run_start is not None:
                # run ended on the previous frame; stop is exclusive
                segments.append(make_record(keys, run_start, int(last_frame + 1)))
                run_start = None
            last_frame = frame

        # a run that reaches the last frame of the group
        if run_start is not None:
            segments.append(make_record(keys, run_start, int(frames[-1] + 1)))

    if not segments:
        return pd.DataFrame(columns=segment_cols)

    submission = pd.DataFrame(segments)
    submission = submission.sort_values(
        ["lab_id", "video_id", "agent_id", "target_id", "action", "start_frame"]
    ).reset_index(drop=True)

    return submission

# Videos excluded from the validation ground truth.
BAD_VIDEOS = [143861384, 1596473327, 1212811043, 878123481]

def compute_validation_score(
    submission: pd.DataFrame,
    lab_id: Optional[str] = None,
) -> float:
    """
    Score a segment-level submission with the imported ``score()`` metric
    against the training annotations.

    The ground truth is assembled from the per-video annotation files of the
    videos listed in ``train.csv`` (minus ``BAD_VIDEOS``); each annotation
    row is tagged with its lab_id, video_id and the ``behaviors_labeled``
    value of its video.

    Args:
        submission: predicted segments; must contain a ``lab_id`` column
            when ``lab_id`` is given.
        lab_id: when given, both the ground truth and the submission are
            restricted to that lab.

    Returns:
        The metric value, or 0.0 when no annotation is available.
    """
    # train.csv only supplies the list of videos and their
    # behaviors_labeled; the labels themselves come from the annotation files.
    train_meta = pd.read_csv(INPUT_DIR / "train.csv")

    if lab_id is not None:
        train_meta = train_meta[train_meta["lab_id"] == lab_id].reset_index(drop=True)

    if BAD_VIDEOS:
        train_meta = train_meta[~train_meta["video_id"].isin(BAD_VIDEOS)]

    # Read every annotation file
    all_annotations = []
    for _, row in train_meta.iterrows():
        lab = row["lab_id"]
        vid = row["video_id"]
        ann = load_annotation(lab, vid)
        if ann.empty:
            continue
        # assign() builds a new frame instead of writing columns into the
        # column-sliced frame returned by load_annotation.
        all_annotations.append(
            ann.assign(
                lab_id=lab,
                video_id=vid,
                behaviors_labeled=row["behaviors_labeled"],
            )
        )

    if not all_annotations:
        print("Không có annotation nào để validate!")
        return 0.0

    dataset = pd.concat(all_annotations, ignore_index=True)

    # Restrict the submission to the requested lab
    if lab_id is not None:
        submission = submission[submission["lab_id"] == lab_id].reset_index(drop=True)

    # Call the metric: ground truth first, submission second
    s = score(dataset, submission, row_id_column_name="row_id")

    print(
        f"Official validation score"
        f"{' (lab=' + lab_id + ')' if lab_id is not None else ''}: {s:.6f}"
    )
    return float(s)



# =========================================================
# 8. MAIN
# =========================================================
def str_to_mouse_id(s: str) -> int:
    """Convert a mouse label to its integer id.

    ``"self"`` maps to -1; any other value is stringified, every occurrence
    of ``"mouse"`` is removed, and the remainder is parsed with ``int``.
    """
    return -1 if s == "self" else int(str(s).replace("mouse", ""))


def predict_behaviors_for_pair(
    lab_id: str,
    video_id: int,
    agent_internal_id: Any,
    target_internal_id: Any,
    behaviors: List[str],
    test_meta: pd.DataFrame,
) -> pd.DataFrame:
    """
    Run inference for one (video, agent, target) pair over a list of
    behaviors (all "self" or all "pair").

    Per behavior, every fold model contributes its probability only on the
    frames where it passes that fold's threshold; contributions are averaged
    over folds. Each frame is then labelled with the behavior of highest
    averaged score ("none" when every score is 0), and runs of equal labels
    are merged into segments.

    Args:
        lab_id: lab name; only "PleasantMeerkat" is served by this function.
        video_id: test video id.
        agent_internal_id: agent mouse id as found in the tracking data.
        target_internal_id: target mouse id (equal to the agent for self).
        behaviors: behavior names to score.
        test_meta: test metadata table.

    Returns:
        Segment-level DataFrame with the columns video_id, action,
        start_frame, stop_frame (stop exclusive). It is empty, never
        ``None``, when the lab is not served, no features or models are
        available, or no frame is assigned a behavior.
    """
    segment_cols = ["video_id", "action", "start_frame", "stop_frame"]

    # Lab gate. Return an empty frame rather than None so that callers can
    # keep using `.empty` on the result.
    if lab_id != "PleasantMeerkat":
        return pd.DataFrame(columns=segment_cols)

    frames, feat_df = get_frame_features_for_pair_test(
        lab_id=lab_id,
        video_id=video_id,
        agent_id=agent_internal_id,
        target_id=target_internal_id,
        test_meta=test_meta,
    )
    if feat_df.empty:
        return pd.DataFrame(columns=segment_cols)

    feat_df = feat_df.astype("float32")
    n_frames = len(feat_df)

    scores_per_behavior = {}
    for behavior in behaviors:
        models = load_models_for_behavior_infer(lab_id, behavior)
        if not models:
            continue

        req_feats = models[0][0].feature_names
        # Build X_test with exactly the feature set the model was trained
        # on; features missing from feat_df stay at 0.0.
        X_test = pd.DataFrame(
            0.0,
            index=feat_df.index,
            columns=req_feats,
            dtype=np.float32,
        )
        common = list(set(req_feats) & set(feat_df.columns))
        if common:
            X_test[common] = feat_df[common]

        dtest = xgb.DMatrix(X_test, feature_names=req_feats)

        agg_scores = np.zeros(n_frames, dtype=np.float32)
        for booster, thr in models:
            probs = booster.predict(dtest)
            passed = (probs >= thr).astype(np.int8)
            # a fold only votes where it clears its own threshold
            agg_scores += probs * passed

        agg_scores /= max(len(models), 1)
        scores_per_behavior[behavior] = agg_scores

        del dtest, X_test
        gc.collect()

    if not scores_per_behavior:
        return pd.DataFrame(columns=segment_cols)

    beh_list = list(scores_per_behavior.keys())
    score_mat = np.vstack([scores_per_behavior[b] for b in beh_list]).T  # [F, B]

    max_idx = score_mat.argmax(axis=1)
    max_scores = score_mat.max(axis=1)
    frame_labels = np.where(max_scores == 0.0, "none", np.array(beh_list)[max_idx])

    # frame-level labels -> segments
    segments = []
    prev_lab = "none"
    prev_start = None
    prev_f = None

    for f, lab in zip(frames, frame_labels):
        if lab != prev_lab:
            if prev_lab != "none":
                segments.append(
                    {
                        "video_id": int(video_id),
                        "action": prev_lab,
                        "start_frame": int(prev_start),
                        "stop_frame": int(prev_f + 1),
                    }
                )
            prev_lab = lab
            prev_start = f
        prev_f = f

    # close the run that is still open at the last frame
    if prev_lab != "none":
        segments.append(
            {
                "video_id": int(video_id),
                "action": prev_lab,
                "start_frame": int(prev_start),
                "stop_frame": int(prev_f + 1),
            }
        )

    if not segments:
        return pd.DataFrame(columns=segment_cols)

    return pd.DataFrame(segments)



# Inference driver for a single lab: predict every test video of `target_lab`
# and write the segments to a CSV in WORKING_DIR.
target_lab = "PleasantMeerkat"
print(f"Đọc test.csv cho lab {target_lab} ...")
test_meta = pd.read_csv(INPUT_DIR / "test.csv")
test_meta = test_meta[test_meta["lab_id"] == target_lab].reset_index(drop=True)

# Trained behaviors = sub-directories of RESULTS_DIR/<target_lab>
lab_result_dir = RESULTS_DIR / target_lab
if lab_result_dir.exists():
    trained_behaviors = sorted(
        [p.name for p in lab_result_dir.iterdir() if p.is_dir()]
    )
else:
    trained_behaviors = []

self_behaviors_in_lab = [b for b in trained_behaviors if b in SELF_BEHAVIORS]
pair_behaviors_in_lab = [b for b in trained_behaviors if b in PAIR_BEHAVIORS]

print("Behaviors (self) dùng để predict:", self_behaviors_in_lab)
print("Behaviors (pair) dùng để predict:", pair_behaviors_in_lab)

all_segments = []

# Loop over every test video of the lab
for video_id in sorted(test_meta["video_id"].unique()):
    print(f"Predict video_id={video_id} ...")

    tracking = load_tracking_test(target_lab, video_id)
    mouse_ids_internal = sorted(tracking["mouse_id"].unique().tolist())

    # Map an internal mouse_id to the string form used in the submission
    # ("mouse<id>", left unchanged when it already starts with "mouse").
    def to_submit_id(mid):
        s = str(mid)
        return s if s.startswith("mouse") else f"mouse{s}"

    # SELF behaviors: agent == target (self)
    if self_behaviors_in_lab:
        for mid in mouse_ids_internal:
            seg_df = predict_behaviors_for_pair(
                lab_id=target_lab,
                video_id=video_id,
                agent_internal_id=mid,
                target_internal_id=mid,  # self
                behaviors=self_behaviors_in_lab,
                test_meta=test_meta,
            )
            if not seg_df.empty:
                seg_df["agent_id"] = to_submit_id(mid)
                seg_df["target_id"] = "self"
                all_segments.append(seg_df)

    # PAIR behaviors: every ordered pair with agent != target
    if pair_behaviors_in_lab and len(mouse_ids_internal) > 1:
        for agent_internal, target_internal in itertools.permutations(
            mouse_ids_internal, 2
        ):
            seg_df = predict_behaviors_for_pair(
                lab_id=target_lab,
                video_id=video_id,
                agent_internal_id=agent_internal,
                target_internal_id=target_internal,
                behaviors=pair_behaviors_in_lab,
                test_meta=test_meta,
            )
            if not seg_df.empty:
                seg_df["agent_id"] = to_submit_id(agent_internal)
                seg_df["target_id"] = to_submit_id(target_internal)
                all_segments.append(seg_df)

# Merge all segments into one table (written to submission6.csv below)
if all_segments:
    submission6 = pd.concat(all_segments, ignore_index=True)
    submission6 = submission6[
        ["video_id", "agent_id", "target_id", "action", "start_frame", "stop_frame"]
    ]
    submission6 = submission6.sort_values(
        ["video_id", "agent_id", "target_id", "action", "start_frame"]
    ).reset_index(drop=True)
else:
    # Empty DataFrame, deliberately WITHOUT a dummy row
    submission6 = pd.DataFrame(
        columns=[
            "video_id",
            "agent_id",
            "target_id",
            "action",
            "start_frame",
            "stop_frame",
        ]
    )

# Add row_id (also when the table is empty)
submission6.insert(0, "row_id", np.arange(len(submission6), dtype=np.int64))

sub_path = WORKING_DIR / "submission6.csv"
submission6.to_csv(sub_path, index=False)
print(f"Saved PleasantMeerkat submission to {sub_path}")



Đọc test.csv cho lab PleasantMeerkat ...
Behaviors (self) dùng để predict: []
Behaviors (pair) dùng để predict: ['attack', 'chase', 'escape', 'follow']
Saved PleasantMeerkat submission to /kaggle/working/submission6.csv


# SparklingTapir

In [12]:
from pathlib import Path
import shutil
import gc

WORKING_DIR = Path("/kaggle/working")

# 1) Remove everything directly inside /kaggle/working except .csv files.
#    Failures are reported and do not stop the sweep.
for entry in WORKING_DIR.iterdir():
    if entry.is_file():
        # keep .csv files
        if entry.suffix == ".csv":
            continue
        try:
            entry.unlink()
        except Exception as e:
            print(f"Cannot remove file {entry}: {e}")
    elif entry.is_dir():
        try:
            shutil.rmtree(entry, ignore_errors=True)
        except Exception as e:
            print(f"Cannot remove dir {entry}: {e}")


gc.collect()

77

In [13]:
from __future__ import annotations
from typing import Dict, List, Tuple, Any, Optional
import warnings
from dataclasses import dataclass, field
import numpy as np
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from tqdm import tqdm

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
np.seterr(invalid="ignore", divide="ignore")

# =============================================================================
# 1. CONFIGURATION
# =============================================================================
@dataclass
class FeatureConfig:
    """
    Hyperparameters shared by the feature-extraction code.
    """
    # Video frame rate; FeatureExtractor._scale rescales 30-fps window
    # lengths by fps / 30.
    fps: float = 30.0
    # Pixels per centimetre; FeatureExtractor._to_cm divides by this value.
    pix_per_cm: float = 1.0
    # Sigma passed to gaussian_filter1d in FeatureExtractor._smooth
    # (None disables smoothing there).
    smooth_sigma: float = 1.0
    # NOTE(review): presumably toggles the agent-target (pairwise) features —
    # confirm against FeatureExtractor.extract_agent_target.
    use_pairwise: bool = True


# =============================================================================
# 2. AGENT CONTEXT
# =============================================================================
@dataclass
class AgentContext:
    """
    Container for the pre-processed data of one mouse.
    Avoids recomputing velocity / acceleration several times.
    """
    idx: pd.Index          # Frame index
    pos: np.ndarray        # [F, 2] cm
    vel: np.ndarray        # [F, 2] cm/s
    speed: np.ndarray      # [F, 1] cm/s
    acc: np.ndarray        # [F, 2] cm/s^2

    cx: pd.Series          # Series of X coordinates (for rolling windows)
    cy: pd.Series          # Series of Y coordinates
    speed_series: pd.Series # Series of speed values

    raw_df: Optional[pd.DataFrame] = None # Raw per-body-part data


# =============================================================================
# 3. FEATURE EXTRACTOR
# =============================================================================
class FeatureExtractor:
    """
    Class trích xuất đặc trưng hành vi từ dữ liệu tracking.
    """
    def __init__(self, fps: float, pix_per_cm: float, smooth_sigma: float = 1.0, use_pairwise: bool = True):
        """
        Store the extraction settings and register the feature modules.

        Args:
            fps: Frame rate of the video; coerced to ``float``.
            pix_per_cm: Pixel-to-centimetre scale; coerced to ``float``.
            smooth_sigma: Sigma of the Gaussian smoothing, stored as given.
            use_pairwise: Flag stored on the configuration as given.
        """
        settings = dict(
            fps=float(fps),
            pix_per_cm=float(pix_per_cm),
            smooth_sigma=smooth_sigma,
            use_pairwise=use_pairwise,
        )
        self.cfg = FeatureConfig(**settings)

        # Feature modules keyed by registry name, in registration order.
        modules = (
            ("kinematics", self._feat_basic_kinematics),
            ("multiscale", self._feat_multiscale),
            ("long_range", self._feat_long_range),
            ("cumulative", self._feat_cumulative),
            ("curvature", self._feat_curvature),
            ("speed_asym", self._feat_speed_asym),
            ("gauss_shift", self._feat_gauss_shift),
            ("pose", self._feat_pose_shape),
            ("a", self._feat_attack_defend),
            ("follow", self._feat_follow_pattern),
            ("short", self._feat_shortburst_social),
            ("pairwise", self._feat_pairwise),
        )
        self.feature_registry = dict(modules)

    # --- Helpers ---
    def _scale(self, n_frames_30fps: int) -> int:
        """Quy đổi số frame từ chuẩn 30fps sang fps thực tế của video."""
        return max(1, int(round(n_frames_30fps * self.cfg.fps / 30.0)))

    def _to_cm(self, arr):
        """Chuyển pixel -> cm."""
        return arr / self.cfg.pix_per_cm

    def _smooth(self, x):
        """Làm mượt dữ liệu bằng Gaussian filter."""
        if self.cfg.smooth_sigma is None or x.shape[0] < 3: return x
        if np.all(np.isnan(x)): return x
        return gaussian_filter1d(x, sigma=self.cfg.smooth_sigma, axis=0, mode="nearest")

    def _forward_fill_nan(self, pos):
        """
        Điền dữ liệu thiếu (NaN) bằng giá trị hợp lệ trước đó (Forward Fill).
        """
        if np.all(np.isnan(pos)):
            return np.zeros_like(pos)

        pos_ffill = pos.copy()
        mask = np.any(~np.isnan(pos_ffill), axis=1)
        if not mask.any():
            return np.zeros_like(pos_ffill)

        valid_idx = np.where(mask)[0]
        first, last = valid_idx[0], valid_idx[-1]
        pos_ffill[:first] = pos_ffill[first]
        pos_ffill[last + 1:] = pos_ffill[last]
        df_temp = pd.DataFrame(pos_ffill)
        df_temp = df_temp.ffill()
        return df_temp.to_numpy()
    
    def _speed_series(self, cx: pd.Series, cy: pd.Series) -> pd.Series:
        dx = cx.diff()
        dy = cy.diff()
        v = np.hypot(dx, dy).fillna(0.0) * self.cfg.fps
        return v.astype("float32")
    
    def _roll_future_mean(self, s: pd.Series, w: int, min_p: int = 1) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).mean().iloc[::-1]

    def _roll_future_var(self, s: pd.Series, w: int, min_p: int = 2) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).var().iloc[::-1]

    # --- Core Logic ---
    def _compute_kinematics(self, pos_px: np.ndarray):
        """
        Tính toán vật lý cơ bản: Pos(cm), Vel, Speed, Acc.
        Input: Array [Frames, 2] (pixel).
        Output: Tuple (pos_cm, vel, speed, acc).
        """
        pos_ffill = self._forward_fill_nan(pos_px)
        pos_cm = self._to_cm(pos_ffill.astype(np.float32))
        pos_cm = self._smooth(pos_cm)                                               # [F, 2]

        dt = 1.0 / self.cfg.fps
        vel = np.zeros_like(pos_cm, dtype=np.float32)
        vel[1:] = (pos_cm[1:] - pos_cm[:-1]) / dt                                   # [F, 2: (vx, vy)]
        speed = np.linalg.norm(vel, axis=1, keepdims=True).astype(np.float32)       # [F, 1]

        acc = np.zeros_like(pos_cm, dtype=np.float32)                          
        acc[1:] = (vel[1:] - vel[:-1]) / dt                                         # [F, 2:(ax, ay)]
        return pos_cm.astype(np.float32), vel, speed, acc

    def _build_context(self, frames, pos_px, mouse_df=None) -> AgentContext:
        """
        Build the AgentContext of one mouse: kinematics arrays plus
        Series views (x, y, speed) indexed by frame.

        Args:
            frames: Frame labels, used as an index named "frame".
            pos_px: Positions in pixels, passed to ``_compute_kinematics``.
            mouse_df: Optional per-body-part data, stored as ``raw_df``.
        """
        pos_cm, vel, speed, acc = self._compute_kinematics(pos_px)
        frame_index = pd.Index(frames, name="frame")

        def as_series(values):
            return pd.Series(values, index=frame_index)

        return AgentContext(
            idx=frame_index,
            pos=pos_cm,
            vel=vel,
            speed=speed,
            acc=acc,
            cx=as_series(pos_cm[:, 0]),
            cy=as_series(pos_cm[:, 1]),
            speed_series=as_series(speed[:, 0]),
            raw_df=mouse_df,
        )

    # --- Feature Modules ---
    def _feat_basic_kinematics(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Lấy các giá trị thô: tọa độ x, y, vận tốc vx, vy, tốc độ, gia tốc ax, ay.
        """
        return {
            "a_x": ctx.pos[:, 0], "a_y": ctx.pos[:, 1],
            "a_vx": ctx.vel[:, 0], "a_vy": ctx.vel[:, 1],
            "a_speed": ctx.speed[:, 0],
            "a_ax": ctx.acc[:, 0], "a_ay": ctx.acc[:, 1]
        }

    def _feat_multiscale(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tính tốc độ trung bình (Mean) và độ lệch chuẩn (Std) ở đa mức thời gian.
        Feature 'sp_ratio' đo độ bùng nổ (Burstiness).
        """
        feats = {}
        speed = ctx.speed_series
        frame_scales = [10, 40, 160]
        for scale in frame_scales:
            ws = self._scale(scale)
            if len(speed) >= ws:
                roller = speed.rolling(ws, min_periods=max(1, ws//4), center=True)
                feats[f"sp_m{scale}"] = roller.mean().astype("float32")
                feats[f"sp_s{scale}"] = roller.std().astype("float32")
        feats[f"sp_ratio"] = feats["sp_m10"] / (feats["sp_m160"] + 1e-6)
        return feats 
        
    def _feat_long_range(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Đặc trưng ngữ cảnh dài hạn:
        - x_ml, y_ml: Vị trí trung bình trong quá khứ.
        - sp_pct: Xếp hạng (percentile) của tốc độ hiện tại so với quá khứ.
        """
        feats: Dict[str, pd.Series] = {}
        speed = ctx.speed_series

        for window in [120, 240]:
            ws = self._scale(window)
            if len(ctx.cx) >= ws:
                feats[f"x_ml{window}"] = ctx.cx.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()
                feats[f"y_ml{window}"] = ctx.cy.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()

        for span in [60, 120]:
            s = self._scale(span)
            feats[f"x_e{span}"] = ctx.cx.ewm(span=s, min_periods=1).mean()
            feats[f"y_e{span}"] = ctx.cy.ewm(span=s, min_periods=1).mean()

        for window in [60, 120]:
            ws = self._scale(window)
            if len(speed) >= ws:
                feats[f"sp_pct{window}"] = speed.rolling(
                    ws, min_periods=max(5, ws // 6), center=True
                ).rank(pct=True)
        return feats
    

    def _feat_curvature(self, ctx: AgentContext, **kwargs) -> Dict:
        feats = {}

        vel_x, vel_y = ctx.vel[:, 0], ctx.vel[:, 1]
        acc_x, acc_y = ctx.acc[:, 0], ctx.acc[:, 1]
        cross_prod = vel_x * acc_y - vel_y * acc_x
        vel_mag = np.sqrt(vel_x**2 + vel_y**2)
        moving_mask = vel_mag > 2.0
        vel_mag_safe = np.maximum(vel_mag, 0.1 / self.cfg.fps)
        raw_curv = cross_prod / (vel_mag_safe**3)
        raw_curv = np.where(moving_mask, raw_curv, 0.0)
        min_turn_radius_cm = 0.5
        max_k = 1.0 / min_turn_radius_cm
        raw_curv = np.clip(raw_curv, -max_k, max_k)
        abs_curv = np.abs(raw_curv)
        abs_curv_series = pd.Series(abs_curv, index=ctx.idx)

        for w in [30, 60]:
            ws = self._scale(w)
            min_p = max(ws // 3, 1)
            feats[f"curv_mean_{w}"] = abs_curv_series.rolling(ws, min_periods=min_p).mean()

        angle = np.arctan2(vel_y, vel_x)
        angle_series = pd.Series(angle, index=ctx.idx)
        angle_change = np.abs(angle_series.diff().fillna(0.0))
        angle_change = np.where(angle_change > np.pi, 2 * np.pi - angle_change, angle_change)
        angle_change_series = pd.Series(angle_change, index=ctx.idx)
        angle_change_series = pd.Series(np.where(moving_mask, angle_change_series, 0.0), index=ctx.idx)

        ws = self._scale(30)
        feats["turn_rate_30"] = angle_change_series.rolling(ws, min_periods=max(ws // 3, 1)).sum()

        return feats
    
    def _feat_cumulative(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tổng quãng đường di chuyển trong một khoảng thời gian dài xung quanh frame hiện tại.
        """
        feats = {}
        L = max(1, self._scale(180))
        step = np.hypot(ctx.cx.diff(), ctx.cy.diff()).fillna(0.0)
        path = step.rolling(2 * L + 1, min_periods=max(5, L // 6), center=True).sum()
        feats["path_cum180"] =  path.fillna(0.0).astype("float32")
        return feats

    def _feat_speed_asym(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Bất đối xứng tốc độ (Tương lai - Quá khứ).
        """
        w = max(3, self._scale(30))
        v = ctx.speed_series
        v_past = v.rolling(w, min_periods=1).mean()
        v_fut = self._roll_future_mean(v, w, min_p=1)
        return {"spd_asym_1s": (v_fut - v_past).fillna(0.0)}
    
    def _feat_gauss_shift(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Độ lệch Gaussian (KL Divergence) giữa quá khứ và tương lai.
        Đo lường sự thay đổi trạng thái thống kê.
        """
        w = max(5, self._scale(30))
        v = ctx.speed_series
        mu_p = v.rolling(w, min_periods=1).mean()
        va_p = v.rolling(w, min_periods=1).var().clip(lower=1e-6)
        mu_f = self._roll_future_mean(v, w, min_p=1)
        va_f = self._roll_future_var(v, w, min_p=1).clip(lower=1e-6)

        kl_pf = 0.5 * (
            (va_p / va_f) + ((mu_f - mu_p) ** 2) / va_f - 1.0 + np.log(va_f / va_p)
        )
        kl_fp = 0.5 * (
            (va_f / va_p) + ((mu_p - mu_f) ** 2) / va_p - 1.0 + np.log(va_p / va_f)
        )
        return {
            "spd_symkl_1s": (kl_pf + kl_fp).replace([np.inf, -np.inf], np.nan).fillna(0.0)
        }

    def _feat_avoidance_trajectory(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Tính toán quỹ đạo né tránh:
        1. Relative Heading: Góc di chuyển so với hướng tới đối thủ.
        2. Future Distance Gain: Dự báo xem hành động này có giúp chuột ra xa đối thủ trong tương lai không.
        """
        feats = {}
        if target_ctx is None: 
            return feats

        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")

        # --- 1. RELATIVE HEADING (Góc lệch hướng đi) ---
        # Vector từ Tôi -> Địch
        rel_vec = target_ctx.pos - ctx.pos
        # Góc hướng tới địch (Angle to Target)
        angle_to_target = np.arctan2(rel_vec[:, 1], rel_vec[:, 0])
        
        # Góc di chuyển của Tôi (My Heading)
        my_heading = np.arctan2(ctx.vel[:, 1], ctx.vel[:, 0])
        
        # Độ lệch góc (Absolute Difference)
        # Cần xử lý wrap góc (ví dụ: lệch giữa 179 độ và -179 độ là 2 độ chứ ko phải 358)
        diff = np.abs(angle_to_target - my_heading)
        diff = np.minimum(diff, 2*np.pi - diff) # Chuẩn hóa về [0, pi]
        
        # Feature: Cosine của góc lệch
        # 1.0 (0 độ) -> Lao vào
        # 0.0 (90 độ) -> AVOID (Lách ngang)
        # -1.0 (180 độ) -> Escape
        feats["heading_rel_cos"] = pd.Series(np.cos(diff), index=idx, dtype="float32")
        
        # Feature: Góc lệch tuyệt đối (đổi ra độ cho dễ hình dung nếu cần, ở đây để rad)
        feats["heading_rel_abs"] = pd.Series(diff, index=idx, dtype="float32")


        # --- 2. FUTURE DISTANCE GAIN (Hiệu quả tránh né) ---
        # "Sau 15 frame (0.5s) hoặc 30 frame (1s), mình có xa nó ra không?"
        
        dist_now = np.linalg.norm(rel_vec, axis=1)
        s_dist = pd.Series(dist_now, index=idx)
        
        scales = [15, 30] # 0.5s và 1s
        for w in scales:
            ws = self._scale(w)
            
            # Lấy khoảng cách ở tương lai (shift ngược lên)
            # s.shift(-ws) là giá trị của t + ws
            dist_future = s_dist.shift(-ws)
            gain = dist_future - s_dist
            
            feats[f"dist_gain_{w}f"] = gain.fillna(0.0).astype("float32")

        return feats
    
    def _extract_part(self, ctx: AgentContext, part: str) -> Optional[np.ndarray]:
        if ctx.raw_df is None: return None
        if part not in ctx.raw_df.columns.get_level_values(0): return None
        try:
            sub_df = ctx.raw_df.xs(part, axis=1, level=0)[["x", "y"]].reindex(ctx.idx)
        except KeyError: return None
        raw = sub_df.to_numpy()
        raw = self._forward_fill_nan(raw)
        cm = self._to_cm(raw.astype(np.float32))
        return self._smooth(cm)
    
    def _extract_parts_dict(self, ctx: AgentContext, parts: List[str] = None) -> Dict[str, Optional[np.ndarray]]:
        out = {}
        for p in parts:
            out[p] = self._extract_part(ctx, p)
        return out
        
    def _feat_pose_shape(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Placeholder cho các đặc trưng hình dáng (Elongation, Body Angle...).
        """
        feats = {}

        def zero(): return pd.Series(0.0, index=ctx.idx, dtype="float32")

        def dist(k1, k2):
            p1, p2 = parts.get(k1), parts.get(k2)
            if p1 is None or p2 is None: return zero()
            d = np.linalg.norm(p1 - p2, axis=1)
            return pd.Series(d, index=ctx.idx, dtype="float32")
        
        def body_angle():
            if parts.get("nose") is None: return zero()
            if parts.get("body_center") is None: return zero()
            if parts.get("tail_base") is None: return zero()

            v1 = parts.get("nose") - parts.get("body_center")
            v2 = parts.get("tail_base") - parts.get("body_center")
            dot_product = np.sum(v1 * v2, axis=1)
            mag = np.linalg.norm(v1, axis=1) * np.linalg.norm(v2, axis=1)
            cos_angle = np.clip(dot_product / (mag + 1e-6), -1.0, 1.0).astype("float32")
            return cos_angle
        
        def elongation():
            if parts.get("nose")          is None: return zero()
            if parts.get("tail_base")     is None: return zero()
            if parts.get("lateral_left")  is None: return zero()
            if parts.get("lateral_right") is None: return zero()

            d1 = dist("nose", "tail_base")
            d2 = dist("lateral_left", "lateral_right")
            elongation = d1 / (d2 + 1e-6).astype("float32")
            return elongation
        
        def vel(part: str, n_frames_30fps: int) -> Dict:
            part_pos = self._extract_part(ctx, part)
            if part_pos is None: return zero()
            
            s_x = pd.Series(part_pos[:, 0], index=ctx.idx)
            s_y = pd.Series(part_pos[:, 1], index=ctx.idx)
            raw_speed = self._speed_series(s_x, s_y)

            ws = self._scale(n_frames_30fps)
            val = raw_speed.rolling(ws, min_periods=1, center=True).mean()
            return val.astype("float32")


        target_parts = ["nose", "neck", "body_center", "tail_base", 
                        "ear_left", "ear_right", 
                        "lateral_left", "lateral_right", "tail_midpoint", "tail_tip"]
        
        parts = self._extract_parts_dict(ctx, target_parts)

        feats["a_body_width"]                = dist("lateral_left", "lateral_right")
        feats["aa_nose_bodycenter_dist"]     = dist("nose", "body_center")
        #feats["aa_nose_tailbase_dist"]       = dist("nose", "tail_base")
        #feats["aa_bodycenter_tailbase_dist"] = dist("body_center", "tail_base")
        
        feats["aa_bodycenter_ear_l_dist"]    = dist("body_center", "ear_left")
        feats["aa_bodycenter_ear_r_dist"]    = dist("body_center", "ear_right")
        feats["aa_bodycenter_lateral_l_dist"]= dist("body_center", "lateral_left")
        feats["aa_bodycenter_lateral_r_dist"]= dist("body_center", "lateral_right")
        
        feats["a_body_angle"]                = body_angle()
        feats["a_elongation"]                = elongation()
        feats["a_tail_base_vel_500ms"]       = vel("tail_base", 15)
        feats["a_tail_base_vel_1000ms"]      = vel("tail_base", 30)
        feats["a_tail_base_vel_2000ms"]      = vel("tail_base", 60)
        feats["a_tail_base_vel_3000ms"]      = vel("tail_base", 90)
        feats["a_nose_vel_500ms"]            = vel("nose", 15)
        feats["a_nose_vel_1000ms"]           = vel("nose", 30)
        feats["a_nose_vel_2000ms"]           = vel("nose", 60)
        feats["a_nose_vel_3000ms"]           = vel("nose", 90)
        # feats["a_ear_right_vel_500ms"]       = vel("ear_right", 15)
        # feats["a_ear_right_vel_1000ms"]      = vel("ear_right", 30)
        # feats["a_ear_right_vel_2000ms"]      = vel("ear_right", 60)
        # feats["a_ear_right_vel_3000ms"]      = vel("ear_right", 90)
        # len_1 = dist("tail_base", "tail_midpoint")
        # len_2 = dist("tail_midpoint", "tail_tip")
        # len_full = dist("tail_base", "tail_tip")
        # feats["tail_curl"] = ((len_1 + len_2) / (len_full + 1e-6)).astype("float32")
        return feats

    def _feat_attack_defend(
        self,
        ctx: AgentContext,
        target_ctx: AgentContext = None,
        **kwargs
    ) -> Dict[str, pd.Series]:
        """
        Feature chuyên để phân biệt attack vs defend cho cặp chuột.
    
        - attack: cả hai chuyển động mạnh, speed & biến thiên speed lớn,
                  khoảng cách nhỏ, đổi hướng loạn xạ.
        - defend: agent đứng gần đối thủ, speed thấp, quay mặt về phía đối thủ.
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
    
        def zero() -> pd.Series:
            return pd.Series(0.0, index=idx, dtype="float32")
    
        # -----------------------------
        # 1. SPEED & BIẾN THIÊN SPEED
        # -----------------------------
        v_a = ctx.speed_series        # agent speed (cm/s)
        v_t = target_ctx.speed_series # target speed
    
        ws_short = self._scale(10)  # ~0.3s
        mp_short = max(ws_short // 3, 1)
    
        def roll_mean(s: pd.Series) -> pd.Series:
            return (
                s.rolling(ws_short, min_periods=mp_short)
                 .mean()
                 .fillna(0.0)
                 .astype("float32")
            )
    
        def roll_std(s: pd.Series) -> pd.Series:
            return (
                s.rolling(ws_short, min_periods=mp_short)
                 .std()
                 .fillna(0.0)
                 .astype("float32")
            )
    
        a_spd_mean = roll_mean(v_a)
        t_spd_mean = roll_mean(v_t)
        a_spd_std  = roll_std(v_a)
        t_spd_std  = roll_std(v_t)
    
        feats["atk_a_speed_mean_10"] = a_spd_mean
        feats["atk_t_speed_mean_10"] = t_spd_mean
        feats["atk_a_speed_std_10"]  = a_spd_std
        feats["atk_t_speed_std_10"]  = t_spd_std
    
        # "Violence" = tổng biến thiên speed hai bên
        feats["atk_speed_violence_10"] = (
            a_spd_std + t_spd_std
        ).astype("float32")
    
        # -------------------------------------------------
        # 2. RELATIVE DISTANCE & STABILITY
        # -------------------------------------------------
        # dùng rel_dist từ pairwise nếu có, còn không thì tính lại
        rel_vec = target_ctx.pos - ctx.pos
        rel_dist = np.linalg.norm(rel_vec, axis=1).astype("float32")
        rel_dist_s = pd.Series(rel_dist, index=idx, dtype="float32")
        feats["atk_rel_dist"] = rel_dist_s
    
        # deviance distance (attack: distance thay đổi nhanh, defend: ổn định)
        rel_dist_diff = rel_dist_s.diff().abs().fillna(0.0)
        feats["atk_rel_dist_diff_abs"] = rel_dist_diff.astype("float32")
    
        # -------------------------------------------------
        # 3. TURNING / DIRECTION CHANGE (cho "loạn xạ")
        # -------------------------------------------------
        vx_a, vy_a = ctx.vel[:, 0], ctx.vel[:, 1]
        angle_a = np.arctan2(vy_a, vx_a)
        angle_a = pd.Series(angle_a, index=idx)
        dtheta = angle_a.diff().fillna(0.0).abs()
        dtheta = np.where(dtheta > np.pi, 2 * np.pi - dtheta, dtheta)
        dtheta = pd.Series(dtheta, index=idx)
    
        feats["atk_agent_turn_rate_10"] = (
            dtheta.rolling(ws_short, min_periods=mp_short)
                  .sum()
                  .fillna(0.0)
                  .astype("float32")
        )
    
        # -------------------------------------------------
        # 4. ORIENTATION CHO DEFEND (quay đầu khè)
        # -------------------------------------------------
        # body vector agent: tail_base -> nose
        parts_a = self._extract_parts_dict(
            ctx,
            ["nose", "tail_base"],
        )
        a_nose = parts_a.get("nose")
        a_tail = parts_a.get("tail_base")
    
        # dùng target body_center làm "thân" để đi tới
        parts_t = self._extract_parts_dict(
            target_ctx,
            ["body_center"],
        )
        t_body = parts_t.get("body_center")
    
        if a_nose is not None and a_tail is not None and t_body is not None:
            body_vec_a = a_nose - a_tail  # tail -> head (agent)
            vec_to_target = t_body - a_tail
    
            dot = np.sum(body_vec_a * vec_to_target, axis=1)
            mag = np.linalg.norm(body_vec_a, axis=1) * np.linalg.norm(vec_to_target, axis=1)
    
            cos_orient = np.zeros_like(dot, dtype="float32")
            valid = mag > 1e-3
            cos_orient[valid] = np.clip(dot[valid] / mag[valid], -1.0, 1.0)
    
            feats["def_body_facing_cos"] = pd.Series(cos_orient, index=idx, dtype="float32")
        else:
            feats["def_body_facing_cos"] = zero()
    
        # Ý nghĩa:
        #   +1 ~ agent tail->head hướng thẳng tới target (đối mặt/khè)
        #   0  ~ vuông góc
        #   -1 ~ quay lưng

        # -------------------------------------------------
        # 5. DEFENSIVE PATTERN: ĐỨNG GẦN, ÍT DI CHUYỂN, QUAY MẶT
        # -------------------------------------------------
        # score mềm: high nếu "đứng giữ vị trí + facing"
        near_mask = (rel_dist_s < 5.0).astype("float32")  # <5cm tùy lab chỉnh
        low_speed = (a_spd_mean < 3.0).astype("float32")  # cm/s, chỉnh tùy fps
    
        def_cos = feats["def_body_facing_cos"]
    
        feats["def_posture_score"] = (
            near_mask * low_speed * (0.5 * (def_cos + 1.0))
        ).astype("float32")
        # def_posture_score ~ 1: gần, chậm, đang đối mặt
    
        # -------------------------------------------------
        # 6. CLEAN NaN / Inf
        # -------------------------------------------------
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )
    
        return feats


    
    def _feat_follow_pattern(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Đặc trưng hành vi FOLLOW:
          - Agent ở gần target
          - Cùng hướng (body + velocity)
          - Tốc độ vừa phải
          - Khoảng cách tương đối ổn định trong 0.5–1s
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")
    
        # --- 1. CÁC ĐẠI LƯỢNG CƠ BẢN ---
        # Vector Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        rel_dist = np.linalg.norm(rel_vec, axis=1)
        rel_dist_s = pd.Series(rel_dist, index=idx, dtype="float32")
    
        # Speed agent/target
        a_speed = ctx.speed_series.astype("float32")
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32",
        )
    
        # Body vector: nose - tail/body_center
        parts_a = self._extract_parts_dict(ctx, ["nose", "tail_base", "ear_left", "ear_right"])
        parts_t = self._extract_parts_dict(target_ctx, ["nose", "tail_base", "ear_right", "ear_left"])
    
        def body_vec(parts_dict):
            head = parts_dict.get("nose")
            tail = parts_dict.get("tail_base")
            if head is None or tail is None:
                return None
            return head - tail
    
        a_body = body_vec(parts_a)
        t_body = body_vec(parts_t)
    
        if a_body is not None and t_body is not None:
            dot_bt = np.sum(a_body * t_body, axis=1)
            mag_bt = np.linalg.norm(a_body, axis=1) * np.linalg.norm(t_body, axis=1)
            cos_body = np.clip(dot_bt / (mag_bt + 1e-6), -1.0, 1.0)
            cos_body_s = pd.Series(cos_body, index=idx, dtype="float32")
        else:
            cos_body_s = zero()
    
        # Velocity hướng
        a_vel = ctx.vel
        t_vel = target_ctx.vel
        a_speed_np = np.linalg.norm(a_vel, axis=1)
        t_speed_np = np.linalg.norm(t_vel, axis=1)
        moving_mask = (a_speed_np > 1e-3) & (t_speed_np > 1e-3)
    
        # cos giữa hướng velocity 2 con
        dot_v = np.sum(a_vel * t_vel, axis=1)
        mag_v = a_speed_np * t_speed_np + 1e-6
        cos_vel = np.zeros_like(dot_v, dtype="float32")
        cos_vel[moving_mask] = np.clip(dot_v[moving_mask] / mag_v[moving_mask], -1.0, 1.0)
        cos_vel_s = pd.Series(cos_vel, index=idx, dtype="float32")
    
        # --- 2. WINDOW NGẮN (FOLLOW LÀ PATTERN DÀI HƠN ATTACK) ---
        for w30 in [15, 30, 60]:   # ~0.5s, 1s, 2s
            ws = self._scale(w30)
            min_p = max(ws // 3, 1)
    
            # Khoảng cách trung bình & độ dao động
            m_dist = rel_dist_s.rolling(ws, min_periods=min_p).mean()
            s_dist = rel_dist_s.rolling(ws, min_periods=min_p).std()
    
            # Cùng hướng (body + velocity)
            m_cos_body = cos_body_s.rolling(ws, min_periods=min_p).mean()
            m_cos_vel  = cos_vel_s.rolling(ws, min_periods=min_p).mean()
    
            # Tốc độ vừa phải
            m_sp_a = a_speed.rolling(ws, min_periods=min_p).mean()
            m_sp_t = t_speed.rolling(ws, min_periods=min_p).mean()
    
            feats[f"follow_dist_mean_{w30}"] = m_dist
            feats[f"follow_dist_std_{w30}"]  = s_dist
            feats[f"follow_cos_body_mean_{w30}"] = m_cos_body
            feats[f"follow_cos_vel_mean_{w30}"]  = m_cos_vel
            feats[f"follow_speed_agent_mean_{w30}"] = m_sp_a
            feats[f"follow_speed_target_mean_{w30}"] = m_sp_t
    
        # Clean
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )
    
        return feats

    def _feat_submission_temporal(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Đặc trưng 'Ký ức sợ hãi' (Fear Memory) để bắt Submit tĩnh.
        Giúp phân biệt Submit (sau khi bị đánh) vs Rest (bình yên).
        """
        feats = {}
        if target_ctx is None: return feats
        
        idx = ctx.idx
        
        # --- 1. XÂY DỰNG TÍN HIỆU XUNG ĐỘT GỐC (RAW CONFLICT SIGNAL) ---
        # Conflict = (Nó nhanh) * (Nó hướng về tôi) * (Ở gần)
        
        # A. Nó hướng về tôi không? (Gaze Cosine)
        # Vector nối Tôi -> Nó
        vec_to_target = target_ctx.pos - ctx.pos
        dist = np.linalg.norm(vec_to_target, axis=1)
        dist_safe = pd.Series(dist, index=idx).replace(0, 1e-6)
        
        # Vector vận tốc của Nó
        t_vel = target_ctx.vel
        
        # Dot product: Vận tốc Nó . Vector hướng về Tôi (Ngược dấu với vec_to_target)
        # vec_target_to_me = -vec_to_target
        # dot > 0 nghĩa là nó đang lao về phía tôi
        dot_threat = np.sum(t_vel * (-vec_to_target), axis=1)
        
        # Threat Score tức thời (cm/s hướng về nạn nhân)
        # Chỉ tính khi nó lại gần (< 15cm)
        threat_raw = (dot_threat / dist_safe).clip(lower=0) 
        threat_raw = threat_raw * (dist_safe < 15.0).astype(float)
        threat_series = pd.Series(threat_raw, index=idx, dtype="float32")

        # --- 2. KÝ ỨC SỢ HÃI (FEAR MEMORY - QUAN TRỌNG NHẤT) ---
        # Dùng Rolling Max để "kéo dài" nỗi sợ.
        # Nếu 2 giây trước nó lao vào tôi, thì giờ tôi vẫn đang sợ.
        
        # Cửa sổ 3 giây (90 frames)
        ws_memory = self._scale(90)
        
        # Fear Level = Max threat trong 3 giây qua
        feats["fear_memory_3s"] = threat_series.rolling(ws_memory, min_periods=1).max().astype("float32")

        # --- 3. TRẠNG THÁI SUBMIT (KẾT HỢP) ---
        # Submit = (Tôi đang đứng yên) * (Tôi đang co cụm) * (Tôi đang sợ)
        
        # Tôi đứng yên (< 1 cm/s)
        my_speed = ctx.speed_series
        is_still = (my_speed < 1.0).astype(float)
        
        # Tôi co cụm (Dùng a_elongation thấp hoặc body_width/length cao)
        # Giả sử bạn đã tính a_elongation ở hàm pose (thấp là co cụm)
        # Nếu chưa có thì dùng tạm logic: elongation < 1.2
        # Ở đây mình tạo feature giả lập độ co cụm nếu chưa có
        parts = self._extract_parts_dict(ctx, ["nose", "tail_base"])
        if parts["nose"] is not None:
            spine_len = np.linalg.norm(parts["nose"] - parts["tail_base"], axis=1)
            is_compact = (spine_len < 8.0).astype(float) # Ví dụ chuột dài < 8cm là co
            is_compact = pd.Series(is_compact, index=idx)
        else:
            is_compact = pd.Series(0.0, index=idx)

        # FINAL SCORE
        # Đây là feature định danh cho Submit tĩnh
        feats["static_submit_prob"] = (
            is_still * is_compact * feats["fear_memory_3s"]
        ).astype("float32")

        return feats


    def _feat_pairwise(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict:
        """
        Đặc trưng tương tác cặp đôi (Pairwise): Khoảng cách, Tốc độ tiếp cận.
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None: 
            return feats

        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")

        # --- 1. KHOẢNG CÁCH CƠ BẢN (DISTANCES) ---
        # Vector nối Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        dist = np.linalg.norm(rel_vec, axis=1)
        feats["rel_dist"] = pd.Series(dist, index=idx, dtype="float32")

        # --- 2. KHOẢNG CÁCH CHI TIẾT (NOSE-TO-PART) ---
        # Lấy các bộ phận quan trọng
        my_parts = self._extract_parts_dict(ctx, ["nose", "tail_base"])
        target_parts = self._extract_parts_dict(target_ctx, 
            ["nose", "tail_base", "body_center", "ear_left", "ear_right", 
             "lateral_left", "lateral_right"])

        def dist_ab(pt_a, pt_b):
            if pt_a is None or pt_b is None: return zero()
            d = np.linalg.norm(pt_a - pt_b, axis=1)
            return pd.Series(d, index=idx, dtype="float32")

        an, tn = my_parts["nose"], target_parts["nose"]
        feats["dist_nose_nose"] = dist_ab(an, tn)
        feats["dist_nose_tail"] = dist_ab(an, target_parts["tail_base"])
        feats["dist_nose_body"] = dist_ab(an, target_parts["body_center"])
        feats["dist_nose_el"]   = dist_ab(an, target_parts["ear_left"])
        feats["dist_nose_er"]   = dist_ab(an, target_parts["ear_right"])
        feats["dist_nose_tll"]  = dist_ab(an, target_parts["lateral_left"])
        feats["dist_nose_tlr"]  = dist_ab(an, target_parts["lateral_right"])
        feats["dist_tail_tail"] = dist_ab(my_parts["tail_base"], target_parts["tail_base"])

        # --- 3. ĐỊNH HƯỚNG & GÓC NHÌN (ORIENTATION & GAZE) ---
        # Helper lấy vector cơ thể (Mũi - Đuôi/Thân)
        def get_body_vec(parts_dict):
            head = parts_dict.get("nose")
            # Ưu tiên đuôi, nếu ko có thì dùng thân
            tail = parts_dict.get("tail_base")
            if tail is None: tail = parts_dict.get("body_center") # Fallback
            
            if head is not None and tail is not None:
                return head - tail
            return None

        a_vec = get_body_vec(my_parts)
        t_vec = get_body_vec(target_parts)

        # A. Body Cosine: Hai con cùng chiều hay ngược chiều?
        if a_vec is not None and t_vec is not None:
            dot = np.sum(a_vec * t_vec, axis=1)
            mags = np.linalg.norm(a_vec, axis=1) * np.linalg.norm(t_vec, axis=1)
            feats["body_cosine"] = pd.Series(
                np.clip(dot / (mags + 1e-6), -1.0, 1.0), index=idx, dtype="float32"
            )
        else:
            feats["body_cosine"] = zero()

        # B. Gaze Cosine: Tôi có đang nhìn về phía Target không?
        # Vector ánh nhìn = Target_Pos - My_Pos = rel_vec
        if a_vec is not None:
            dot_gaze = np.sum(a_vec * rel_vec, axis=1)
            mag_a = np.linalg.norm(a_vec, axis=1)
            # dist đã tính ở bước 1
            feats["gaze_cosine"] = pd.Series(
                np.clip(dot_gaze / (mag_a * dist + 1e-6), -1.0, 1.0),
                index=idx, dtype="float32"
            )
        else:
            feats["gaze_cosine"] = zero()

        # --- 4. PHÂN RÃ VẬN TỐC (VELOCITY DECOMPOSITION) - CHÌA KHÓA CHO AVOID/ESCAPE ---
        # Vector đơn vị hướng về địch (u)
        dist_safe = dist.copy()
        dist_safe[dist_safe == 0] = 1e-6
        u_vec = rel_vec / dist_safe[:, None]

        # a_vel và t_vel lấy từ Context
        a_vel, t_vel = ctx.vel, target_ctx.vel

        # A. Approach Speed (Vận tốc dọc trục nối 2 con)
        # Dương: Lao vào nhau | Âm: Chạy ra xa nhau
        a_along = np.sum(a_vel * u_vec, axis=1)
        t_along = np.sum(t_vel * (-u_vec), axis=1) # Target hướng ngược lại
        rel_along = np.sum((a_vel - t_vel) * u_vec, axis=1)

        # B. Lateral Speed (Vận tốc ngang - Vuông góc trục nối)
        # Vector chiếu: v_proj = (v . u) * u
        a_proj = a_along[:, None] * u_vec
        a_lat_vec = a_vel - a_proj
        a_lat_speed = np.linalg.norm(a_lat_vec, axis=1)

        feats["approach_speed_agent"]  = pd.Series(a_along, index=idx, dtype="float32")
        feats["approach_speed_target"] = pd.Series(t_along, index=idx, dtype="float32")
        feats["approach_speed_rel"]    = pd.Series(rel_along, index=idx, dtype="float32")
        feats["lateral_speed_agent"]   = pd.Series(a_lat_speed, index=idx, dtype="float32")
        return feats
    
    def _feat_shortburst_social(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        feats = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")
    
        # --- Lấy lại vài quantity cơ bản từ pairwise/avoidance ---
        # vector Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        rel_dist = np.linalg.norm(rel_vec, axis=1)
        rel_dist_s = pd.Series(rel_dist, index=idx, dtype="float32")
    
        # unit vector
        rel_dist_safe = np.where(rel_dist == 0, 1e-6, rel_dist)
        u_vec = rel_vec / rel_dist_safe[:, None]
    
        # velocity dọc trục nối (approach speed)
        a_vel = ctx.vel
        t_vel = target_ctx.vel
        a_along = np.sum(a_vel * u_vec, axis=1)                # +: lao vào target
        t_along = np.sum(t_vel * (-u_vec), axis=1)             # +: target lao vào agent
        rel_along = np.sum((a_vel - t_vel) * u_vec, axis=1)    # +: lại gần nhau
    
        a_along_s = pd.Series(a_along, index=idx, dtype="float32")
        t_along_s = pd.Series(t_along, index=idx, dtype="float32")
        rel_along_s = pd.Series(rel_along, index=idx, dtype="float32")
    
        # speed agent / target
        a_speed = ctx.speed_series
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32"
        )
    
        # heading_rel_cos ~ escape / approach
        # vector body của agent
        # (reuse idea từ _feat_pairwise)
        # head ~ nose, tail ~ tail_base/body_center
        parts_a = self._extract_parts_dict(ctx, ["nose", "tail_base"])
        head_a = parts_a.get("nose")
        tail_a = parts_a.get("tail_base")
    
        if head_a is not None and tail_a is not None:
            body_vec_a = head_a - tail_a
            dot = np.sum(body_vec_a * rel_vec, axis=1)
            mag = np.linalg.norm(body_vec_a, axis=1) * rel_dist_safe
            heading_cos = np.clip(dot / (mag + 1e-6), -1.0, 1.0)
            heading_cos_s = pd.Series(heading_cos, index=idx, dtype="float32")
        else:
            heading_cos_s = zero()
    
        # --- Rolling window 10, 20, 30 frames (ở fps gốc) ---
        for w30 in [10, 20, 30]:
            ws = self._scale(w30)
            min_p = max(1, ws // 3)
    
            # Attack-like: approach mạnh, khoảng cách giảm nhanh
            feats[f"sb_att_approach_mean_{w30}"] = a_along_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_att_rel_along_mean_{w30}"] = rel_along_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_att_dist_delta_{w30}"] = (rel_dist_s - rel_dist_s.shift(ws)).fillna(0.0)
    
            # Chase-like: agent & target đều nhanh, dist tương đối nhỏ
            feats[f"sb_chase_speed_agent_mean_{w30}"] = a_speed.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_chase_speed_target_mean_{w30}"] = t_speed.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_chase_dist_mean_{w30}"] = rel_dist_s.rolling(ws, min_periods=min_p).mean()
    
            # Escape-like: heading ngược, dist tăng nhanh
            feats[f"sb_esc_heading_cos_mean_{w30}"] = heading_cos_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_esc_dist_gain_{w30}"] = (rel_dist_s.shift(-ws) - rel_dist_s).fillna(0.0)
    
        # clip & fillna
        for k, v in feats.items():
            feats[k] = v.replace([np.inf, -np.inf], np.nan).fillna(0.0).astype("float32")
    
        return feats


    # --- Methods tương thích ---
    
    def build_pose_tensor(self, tracking: pd.DataFrame):
        """
        Convert long-format tracking rows into a centroid tensor plus
        per-mouse wide tables.

        Args:
            tracking: DataFrame with columns ``video_frame``, ``mouse_id``,
                ``bodypart``, ``x`` and ``y``.

        Returns:
            Tuple ``(frames, mouse_ids, pos, per_mouse_df)`` where ``frames``
            is the sorted array of unique frame numbers, ``mouse_ids`` the
            list of mouse ids, ``pos`` a float32 array of shape
            ``[len(frames), len(mouse_ids), 2]`` holding each mouse's centroid
            (``body_center`` when tracked, otherwise the mean over its body
            parts) and ``per_mouse_df`` a dict of per-mouse wide frames with
            ``(bodypart, coordinate)`` columns.
        """
        ordered = tracking.sort_values("video_frame")
        frame_ids = np.sort(ordered["video_frame"].unique())

        # One row per frame, columns keyed by (mouse, bodypart, coordinate).
        wide = ordered.pivot(
            index="video_frame",
            columns=["mouse_id", "bodypart"],
            values=["x", "y"],
        )
        wide = wide.reorder_levels([1, 2, 0], axis=1)
        wide = wide.sort_index(axis=1)
        wide = wide.astype("float32")

        ids = list(wide.columns.get_level_values(0).unique())
        centroids = np.full((len(frame_ids), len(ids), 2), np.nan, dtype=np.float32)
        tables = {}

        for slot, mouse in enumerate(ids):
            table = wide[mouse]
            tables[mouse] = table

            has_center = "body_center" in table.columns.get_level_values(0)
            if has_center:
                center = table["body_center"]
                xs, ys = center["x"], center["y"]
            else:
                # No explicit center: average every tracked body part.
                xs = table.xs("x", level=1, axis=1).mean(axis=1)
                ys = table.xs("y", level=1, axis=1).mean(axis=1)

            centroids[:, slot, 0] = xs.reindex(frame_ids).values
            centroids[:, slot, 1] = ys.reindex(frame_ids).values

        return frame_ids, ids, centroids, tables

    def extract_agent_target(
        self, 
        frames: np.ndarray, 
        mouse_ids: List[Any], 
        pos: np.ndarray, 
        agent_id: Any, 
        target_id: Any, 
        per_mouse_df: Dict = None
    ) -> pd.DataFrame:
        """
        Trích xuất đặc trưng cho cặp (Agent, Target).
        """
        try:
            aid_idx = mouse_ids.index(agent_id)
        except ValueError:
            return pd.DataFrame() 

        # 1. Build Agent Context
        ctx_agent = self._build_context(
            frames, 
            pos[:, aid_idx, :], 
            per_mouse_df.get(agent_id) if per_mouse_df else None
        )

        # 2. Build Target Context
        ctx_target = None
        if self.cfg.use_pairwise and target_id is not None and target_id in mouse_ids:
             tid_idx = mouse_ids.index(target_id)
             ctx_target = self._build_context(
                 frames, 
                 pos[:, tid_idx, :], 
                 per_mouse_df.get(target_id) if per_mouse_df else None
             )

        # 3. Run all features
        all_data = {}
        for func_name, func in self.feature_registry.items():
            out_dict = func(ctx_agent, target_ctx=ctx_target)
            all_data.update(out_dict)

        df_out = pd.DataFrame(all_data, index=ctx_agent.idx)
        df_out = df_out.replace([np.inf, -np.inf], np.nan).fillna(0.0)
        
        return df_out.reindex(sorted(df_out.columns), axis=1)

#=====================================================================================
#=====================================================================================
#=====================================================================================


from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import gc
import itertools
import json
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedGroupKFold

# (Trên Kaggle) dùng metric chính thức
import sys
sys.path.append("/kaggle/usr/lib/mabe-f-beta")
from metric import score   # hàm score(submission_df, dataset_df)

# =========================================================
# 1. ĐƯỜNG DẪN & CẤU HÌNH
# =========================================================

# Root of the competition data and its train/test sub-directories.
INPUT_DIR = Path("/kaggle/input/MABe-mouse-behavior-detection")
TRAIN_TRACKING_DIR = INPUT_DIR / "train_tracking"
TRAIN_ANNOTATION_DIR = INPUT_DIR / "train_annotation"
TEST_TRACKING_DIR = INPUT_DIR / "test_tracking"


WORKING_DIR = Path("/kaggle/working")
# Directory holding per-(lab, behavior) models, thresholds and OOF predictions.
# NOTE(review): this points under /kaggle/input, yet it is created here and
# written to by train_validate_one — confirm the location is writable when
# training (WORKING_DIR may have been intended for that case).
RESULTS_DIR = Path(r"/kaggle/input/results-xgb-fe")
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

# Columns that identify one frame-level sample.
INDEX_COLS = ["video_id", "agent_id", "target_id", "video_frame"]

# "Self" vs "pair" behaviors, following the reference notebook (adjust if needed).
# Behaviors listed in SELF_BEHAVIORS are trained in "self" mode; all others
# are trained in "pair" mode.
SELF_BEHAVIORS = [
    "biteobject", "climb", "dig", "exploreobject", "freeze",
    "genitalgroom", "huddle", "rear", "rest", "run", "selfgroom",
]
PAIR_BEHAVIORS = [
    "allogroom", "approach", "attack", "attemptmount", "avoid",
    "chase", "chaseattack", "defend", "disengage", "dominance",
    "dominancegroom", "dominancemount", "ejaculate", "escape",
    "flinch", "follow", "intromit", "mount", "reciprocalsniff",
    "shepherd", "sniff", "sniffbody", "sniffface", "sniffgenital",
    "submit", "tussle",
]


# =========================================================
# 2. ĐỌC METADATA & HELPER
# =========================================================

def load_metadata() -> pd.DataFrame:
    """Read ``train.csv`` from INPUT_DIR and return it as a DataFrame."""
    return pd.read_csv(INPUT_DIR / "train.csv")


def get_video_params(video_id: Any, meta: pd.DataFrame) -> Tuple[float, float]:
    """
    Look up the frame rate and pixel scale of a video in the metadata table.

    Args:
        video_id: Value matched against the ``video_id`` column.
        meta: Metadata with columns ``video_id``, ``frames_per_second`` and
            ``pix_per_cm_approx``.

    Returns:
        ``(fps, pix_per_cm)`` taken from the first matching row. A
        non-finite or non-positive pixel scale is replaced by 1.0.

    Raises:
        KeyError: If no row matches ``video_id``.
    """
    matches = meta[meta["video_id"] == video_id]
    if len(matches) == 0:
        raise KeyError(f"video_id={video_id} không có trong train.csv")

    first = matches.iloc[0]
    fps = float(first["frames_per_second"])
    scale = float(first["pix_per_cm_approx"])

    # Fall back to a neutral scale when the recorded one is unusable.
    usable = np.isfinite(scale) and scale > 0
    return fps, (scale if usable else 1.0)


def load_tracking(lab_id: str, video_id: Any) -> pd.DataFrame:
    """
    Read the training tracking parquet of one video into pandas.

    The file is looked up at ``TRAIN_TRACKING_DIR/<lab_id>/<video_id>.parquet``.

    Raises:
        FileNotFoundError: If the parquet file does not exist.
    """
    parquet_path = TRAIN_TRACKING_DIR / str(lab_id) / f"{video_id}.parquet"
    if parquet_path.exists():
        return pd.read_parquet(parquet_path)
    raise FileNotFoundError(parquet_path)

def load_tracking_test(lab_id: str, video_id: Any) -> pd.DataFrame:
    """
    Read the test tracking parquet of one video into pandas.

    The file is looked up at
    ``INPUT_DIR/test_tracking/<lab_id>/<video_id>.parquet``.

    Raises:
        FileNotFoundError: If the parquet file does not exist.
    """
    parquet_path = INPUT_DIR / "test_tracking" / str(lab_id) / f"{video_id}.parquet"
    if parquet_path.exists():
        return pd.read_parquet(parquet_path)
    raise FileNotFoundError(parquet_path)


def load_annotation(lab_id: str, video_id: Any) -> pd.DataFrame:
    """
    Read the annotation parquet of one training video.

    Returns:
        DataFrame restricted to the columns ``agent_id``, ``target_id``,
        ``action``, ``start_frame`` and ``stop_frame``. When the video has
        no annotation file, an empty DataFrame with those columns.
    """
    wanted = ["agent_id", "target_id", "action", "start_frame", "stop_frame"]
    parquet_path = TRAIN_ANNOTATION_DIR / str(lab_id) / f"{video_id}.parquet"
    if parquet_path.exists():
        return pd.read_parquet(parquet_path)[wanted]
    # No labels for this video.
    return pd.DataFrame(columns=wanted)


# =========================================================
# 3. TÍNH FEATURE PER-FRAME BẰNG FEATUREEXTRACTOR
# =========================================================

# In-memory cache of per-frame features, filled by get_frame_features_for_pair:
# (lab, video, agent, target) -> (frames, feature_df)
_feature_cache: Dict[Tuple[str, int, int, int], Tuple[np.ndarray, pd.DataFrame]] = {}


def get_frame_features_for_pair(
    lab_id: str,
    video_id: int,
    agent_id: int,
    target_id: int,
    meta: pd.DataFrame,
) -> Tuple[np.ndarray, pd.DataFrame]:
    """
    Compute (or fetch from the cache) per-frame features for one training
    video and one (agent, target) pair.

    Args:
        lab_id: Lab the video belongs to.
        video_id: Video identifier.
        agent_id: Agent mouse id.
        target_id: Target mouse id; may equal ``agent_id`` for self behaviors.
        meta: Metadata table passed to ``get_video_params``.

    Returns:
        ``(frames, features_df)``. ``features_df`` is indexed by frame number.
        When the extractor produced no features (for example the agent is
        not present in the tracking data) it is returned empty, so callers
        should check ``features_df.empty``.
    """
    key = (str(lab_id), int(video_id), int(agent_id), int(target_id))
    if key in _feature_cache:
        return _feature_cache[key]

    fps, pix_per_cm = get_video_params(video_id, meta)
    tracking = load_tracking(lab_id, video_id)

    fe = FeatureExtractor(
        fps=fps,
        pix_per_cm=pix_per_cm,
        smooth_sigma=1.0,
        use_pairwise=True,
    )

    frames, mouse_ids, pos, per_mouse_df = fe.build_pose_tensor(tracking)

    # Agent and target may be the same mouse (self) or different mice (pair).
    features_df: pd.DataFrame = fe.extract_agent_target(
        frames=frames,
        mouse_ids=mouse_ids,
        pos=pos,
        agent_id=agent_id,
        target_id=target_id,
        per_mouse_df=per_mouse_df,
    )
    # Index the features by frame number. An empty result has no rows to
    # relabel; assigning a non-empty index to it would raise a
    # length-mismatch ValueError, so it is passed through untouched.
    if not features_df.empty:
        features_df.index = frames

    _feature_cache[key] = (frames, features_df)
    return frames, features_df

# NOTE(review): this re-binds the `_feature_cache` name already defined above
# to a fresh empty dict (with a looser key type because test-time agent/target
# ids are not cast to int). Train and test lookups share this single dict;
# test keys are prefixed with "test_" to keep them apart.
_feature_cache: Dict[Tuple[str, int, Any, Any], Tuple[np.ndarray, pd.DataFrame]] = {}

def get_frame_features_for_pair_test(
    lab_id: str,
    video_id: int,
    agent_id: Any,
    target_id: Any,
    test_meta: pd.DataFrame,
) -> Tuple[np.ndarray, pd.DataFrame]:
    """
    Compute (or fetch from the cache) per-frame features for one test video
    and one (agent, target) pair.

    Args:
        lab_id: Lab the video belongs to.
        video_id: Video identifier.
        agent_id: Agent mouse id, as used in the tracking data.
        target_id: Target mouse id, as used in the tracking data.
        test_meta: Test metadata with columns ``video_id``,
            ``frames_per_second`` and ``pix_per_cm_approx``.

    Returns:
        ``(frames, features_df)``. ``features_df`` is indexed by frame number.
        When the extractor produced no features (for example the agent is
        not present in the tracking data) it is returned empty, so callers
        should check ``features_df.empty``.

    Raises:
        IndexError: If ``video_id`` has no row in ``test_meta``.
    """
    key = (f"test_{lab_id}", int(video_id), agent_id, target_id)
    if key in _feature_cache:
        return _feature_cache[key]

    # Read fps and pix_per_cm_approx from the test metadata.
    row = test_meta[test_meta["video_id"] == video_id].iloc[0]
    fps = float(row["frames_per_second"])
    pix_per_cm = float(row["pix_per_cm_approx"])
    if not np.isfinite(pix_per_cm) or pix_per_cm <= 0:
        pix_per_cm = 1.0

    tracking = load_tracking_test(lab_id, video_id)

    fe = FeatureExtractor(
        fps=fps,
        pix_per_cm=pix_per_cm,
        smooth_sigma=1.0,
        use_pairwise=True,
    )

    frames, mouse_ids, pos, per_mouse_df = fe.build_pose_tensor(tracking)

    features_df = fe.extract_agent_target(
        frames=frames,
        mouse_ids=mouse_ids,
        pos=pos,
        agent_id=agent_id,
        target_id=target_id,
        per_mouse_df=per_mouse_df,
    )
    # Index the features by frame number. An empty result has no rows to
    # relabel; assigning a non-empty index to it would raise a
    # length-mismatch ValueError before the caller can test `.empty`.
    if not features_df.empty:
        features_df.index = frames

    _feature_cache[key] = (frames, features_df)
    return frames, features_df



# =========================================================
# 4. BUILD FRAME-LEVEL DATASET CHO 1 (lab_id, behavior)
# =========================================================

def build_frame_dataset_for_lab_behavior(
    lab_id: str,
    behavior: str,
    train_meta: pd.DataFrame,
    mode: str = "self",
) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray]:
    """
    Build the frame-level dataset for one (lab, behavior).

    For every annotated video of the lab and every (agent, target) pair that
    has at least one annotation of ``behavior``, the per-frame features are
    computed and each frame is labelled 1 when it falls inside any annotated
    ``[start_frame, stop_frame)`` interval of that pair, else 0.

    Args:
        lab_id: Lab to build the dataset for.
        behavior: Action name to label.
        train_meta: Training metadata with ``lab_id`` and ``video_id`` columns.
        mode: ``"self"`` computes features with the agent as its own target;
            any other value uses the annotated target.

    Returns:
        ``(indices, features, labels)``: ``indices`` has the INDEX_COLS
        columns, ``features`` holds the per-frame features and ``labels`` is
        an int8 array of 0/1. All three have the same number of rows and are
        empty when nothing was collected.
    """

    videos = (
        train_meta[train_meta["lab_id"] == lab_id]["video_id"]
        .unique()
        .tolist()
    )

    index_list: List[pd.DataFrame] = []
    feature_list: List[pd.DataFrame] = []
    label_list: List[np.ndarray] = []

    for video_id in videos:
        ann = load_annotation(lab_id, video_id)
        if ann.empty:
            continue

        # Keep only the annotations of this behavior.
        ann_bhv = ann[ann["action"] == behavior]
        if ann_bhv.empty:
            continue

        # (agent, target) pairs to process.
        pairs = ann_bhv[["agent_id", "target_id"]].drop_duplicates().values.tolist()

        for agent_id, target_id in pairs:
            target_id_use = agent_id if mode == "self" else target_id

            frames, feat_df = get_frame_features_for_pair(
                lab_id=lab_id,
                video_id=video_id,
                agent_id=agent_id,
                target_id=target_id_use,
                meta=train_meta,
            )

            # Features must line up one-to-one with the frames; an empty or
            # misaligned table (e.g. the agent is missing from the tracking
            # data) would break the length invariant checked at the end.
            if len(feat_df) != len(frames):
                print(
                    f" -> skip video={video_id} agent={agent_id} "
                    f"target={target_id_use}: {len(feat_df)} feature rows "
                    f"for {len(frames)} frames"
                )
                continue

            # Annotations of this exact pair; in self mode fall back to every
            # annotation of the agent when the exact match is empty.
            ann_pair = ann_bhv[
                (ann_bhv["agent_id"] == agent_id)
                & (ann_bhv["target_id"] == target_id)
            ]
            if ann_pair.empty and mode == "self":
                ann_pair = ann_bhv[ann_bhv["agent_id"] == agent_id]

            # Per-frame label: frame inside any [start, stop) interval.
            # Interval masks avoid materialising every positive frame number
            # in a Python set; for integer frame numbers the result is the
            # same as testing membership in the union of the ranges.
            positive = np.zeros(len(frames), dtype=bool)
            for start, stop in zip(ann_pair["start_frame"], ann_pair["stop_frame"]):
                positive |= (frames >= int(start)) & (frames < int(stop))

            if not positive.any():
                continue
            label = positive.astype("int8")

            idx_df = pd.DataFrame(
                {
                    "video_id": video_id,
                    "agent_id": agent_id,
                    "target_id": target_id,
                    "video_frame": frames,
                }
            )

            index_list.append(idx_df)
            feature_list.append(feat_df.reset_index(drop=True))
            label_list.append(label)

    if not index_list:
        return (
            pd.DataFrame(columns=INDEX_COLS),
            pd.DataFrame(),
            np.zeros(0, dtype="int8"),
        )

    indices = pd.concat(index_list, ignore_index=True)
    features = pd.concat(feature_list, ignore_index=True)
    labels = np.concatenate(label_list).astype("int8")

    assert len(indices) == len(features) == len(labels)

    return indices, features, labels


# =========================================================
# 5. TRAIN + OOF CHO 1 (lab_id, behavior)
# =========================================================

def tune_threshold(oof_pred: np.ndarray, y: np.ndarray) -> float:
    """
    Pick the probability cut-off that maximises the F1 score.

    Candidates run from 0.0 upward in steps of 0.005; a sample is predicted
    positive when ``oof_pred >= candidate``. On ties the smallest candidate
    wins.
    """
    candidates = np.arange(0.0, 1.005, 0.005)
    f1_per_candidate = []
    for cut in candidates:
        f1_per_candidate.append(f1_score(y, (oof_pred >= cut), zero_division=0))
    best = int(np.argmax(f1_per_candidate))
    return float(candidates[best])

#
def train_validate_one(
    lab_id: str,
    behavior: str,
    indices: pd.DataFrame,
    features: pd.DataFrame,
    labels: np.ndarray,
) -> float:
    """
    Train binary XGBoost models for one (lab, behavior) with 3-fold
    cross-validation grouped by video, and store the out-of-fold predictions.

    Files written under ``RESULTS_DIR/<lab_id>/<behavior>/``:
      - ``fold_<k>/model.json`` and ``fold_<k>/threshold.txt`` per fold,
      - ``oof_predictions.parquet`` (``indices`` plus ``fold``,
        ``prediction`` and ``predicted_label``),
      - ``f1.txt`` with the overall OOF F1.

    Args:
        lab_id: Lab name, used as a directory name.
        behavior: Behavior name, used as a directory name.
        indices: Per-frame index columns; ``video_id`` is the CV group.
        features: Per-frame features, row-aligned with ``indices``.
        labels: Binary per-frame labels, row-aligned with ``features``.

    Returns:
        Frame-level F1 of the thresholded OOF predictions, or 0.0 when there
        are no samples or no positive labels.
    """
    result_dir = RESULTS_DIR / lab_id / behavior
    result_dir.mkdir(parents=True, exist_ok=True)

    n = len(labels)

    # Nothing to learn from: write an all-zero OOF file and stop.
    if n == 0 or labels.sum() == 0:
        oof_df = indices.copy()
        oof_df["fold"] = -1
        oof_df["prediction"] = 0.0
        oof_df["predicted_label"] = 0
        oof_df.to_parquet(result_dir / "oof_predictions.parquet", index=False)
        (result_dir / "f1.txt").write_text("0.0\n")
        return 0.0

    X = features.values.astype("float32")
    y = labels.astype("int8")
    # Group by video so frames of one video never span train and validation.
    groups = indices["video_id"].values

    # -1 marks rows that no validation fold has covered.
    folds = np.ones(n, dtype="int8") * -1
    oof_pred = np.zeros(n, dtype="float32")
    oof_label = np.zeros(n, dtype="int8")

    cv = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=42)

    for fold, (tr_idx, va_idx) in enumerate(cv.split(X, y, groups=groups)):
        fold_dir = result_dir / f"fold_{fold}"
        fold_dir.mkdir(parents=True, exist_ok=True)

        X_tr, y_tr = X[tr_idx], y[tr_idx]
        X_va, y_va = X[va_idx], y[va_idx]

        # scale_pos_weight = negatives / positives of the training fold
        # (1.0 when the fold has no positives).
        pos = y_tr.sum()
        neg = len(y_tr) - pos
        scale_pos_weight = float(neg / pos) if pos > 0 else 1.0

        params = {
            "objective": "binary:logistic",
            "eval_metric": "logloss",
            "device": "cuda",
            "tree_method": "hist",
            "learning_rate": 0.05,
            "max_depth": 6,
            "min_child_weight": 5,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "scale_pos_weight": scale_pos_weight,
            "max_bin": 64,
            "seed": 42,
        }

        # max_bin here matches params["max_bin"].
        dtrain = xgb.QuantileDMatrix(
            X_tr,
            label=y_tr,
            feature_names=features.columns.tolist(),
            max_bin=64,
        )
        dvalid = xgb.DMatrix(
            X_va,
            label=y_va,
            feature_names=features.columns.tolist(),
        )

        evals_result: Dict[str, Dict[str, List[float]]] = {}

        # Stop when the validation logloss has not improved for 10 rounds.
        early_stop = xgb.callback.EarlyStopping(
            rounds=10, metric_name="logloss", data_name="valid", maximize=False
        )

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=250,
            evals=[(dtrain, "train"), (dvalid, "valid")],
            callbacks=[early_stop],
            evals_result=evals_result,
            verbose_eval=False,
        )

        pred_va = model.predict(dvalid)
        # NOTE(review): the threshold is tuned on the same validation fold it
        # is then applied to, so the OOF labels / F1 below are optimistic.
        th = tune_threshold(pred_va, y_va)

        folds[va_idx] = fold
        oof_pred[va_idx] = pred_va
        oof_label[va_idx] = (pred_va >= th).astype("int8")

        model.save_model(fold_dir / "model.json")
        with open(fold_dir / "threshold.txt", "w") as f:
            f.write(f"{th}\n")

    # Save the OOF predictions.
    oof_df = indices.copy()
    oof_df["fold"] = folds
    oof_df["prediction"] = oof_pred
    oof_df["predicted_label"] = oof_label
    oof_df.to_parquet(result_dir / "oof_predictions.parquet", index=False)

    f1 = f1_score(y, oof_label, zero_division=0)
    (result_dir / "f1.txt").write_text(f"{f1:.6f}\n")
    return float(f1)

def load_models_for_behavior_infer(lab_id: str, behavior: str):
    """
    Load every fold model and its threshold for one (lab, behavior).

    Looks at ``RESULTS_DIR/<lab_id>/<behavior>/fold_*`` in sorted order.
    Folds without ``model.json`` are skipped; a fold without
    ``threshold.txt`` gets a threshold of 0.5.

    Returns:
        List of ``(booster, threshold)`` tuples; empty when the directory
        does not exist or holds no model.
    """
    loaded = []
    root = RESULTS_DIR / lab_id / behavior
    if not root.exists():
        return loaded

    for fold_path in sorted(root.glob("fold_*")):
        weights_path = fold_path / "model.json"
        if not weights_path.exists():
            continue

        booster = xgb.Booster()
        booster.load_model(str(weights_path))

        threshold_path = fold_path / "threshold.txt"
        cutoff = (
            float(threshold_path.read_text().strip())
            if threshold_path.exists()
            else 0.5
        )
        loaded.append((booster, cutoff))

    return loaded


# =========================================================
# 6. LOOP QUA TẤT CẢ BEHAVIORS TRONG 1 LAB
#    (train_all_labs_behaviors vẫn giữ nguyên, nhưng main
#     sẽ filter train_meta chỉ còn 1 lab)
# =========================================================

def train_all_labs_behaviors(train_meta: pd.DataFrame):
    """
    Train one frame-level model per (lab, action) for every lab in
    ``train_meta``.

    For each lab the annotations of all its videos are read to collect the
    actions that actually occur; each action is then trained in "self" mode
    when listed in SELF_BEHAVIORS and in "pair" mode otherwise. Progress and
    the OOF F1 of each model are printed.
    """
    started = time.perf_counter()

    for lab_id in train_meta["lab_id"].unique().tolist():
        # Videos that belong to this lab.
        lab_rows = train_meta[train_meta["lab_id"] == lab_id]
        lab_videos = lab_rows["video_id"].unique().tolist()

        # Every action that appears in this lab's annotations.
        seen_actions = set()
        for vid in lab_videos:
            ann = load_annotation(lab_id, vid)
            if not ann.empty:
                seen_actions.update(ann["action"].unique().tolist())

        behaviors = sorted(seen_actions)
        print(f"\n===== LAB {lab_id}: {len(behaviors)} behaviors =====")

        for behavior in behaviors:
            mode = "pair"
            if behavior in SELF_BEHAVIORS:
                mode = "self"

            print(f"\n=== LAB={lab_id} | behavior={behavior} | mode={mode} ===")
            indices, features, labels = build_frame_dataset_for_lab_behavior(
                lab_id=str(lab_id),
                behavior=behavior,
                train_meta=train_meta,
                mode=mode,
            )
            print(
                f"frames: {len(labels):,}, positives: {labels.sum():,}, features: "
                f"{features.shape[1] if not features.empty else 0}"
            )

            if len(labels) == 0:
                print(" -> skip (no samples)")
                continue

            f1 = train_validate_one(str(lab_id), behavior, indices, features, labels)
            elapsed = time.perf_counter() - started
            print(f" -> OOF F1 (frame-level): {f1:.3f} | elapsed={elapsed/60:.1f} min")



# =========================================================
# 7. GOM OOF PREDICTION → SEGMENT & TÍNH SCORE()
# =========================================================

def build_oof_submission_from_parquet(
    target_lab_id: Optional[str] = None,
) -> pd.DataFrame:
    """
    Turn the stored OOF frame predictions into segment-level predictions.

    Every ``oof_predictions.parquet`` found under RESULTS_DIR is read; the
    lab and behavior are taken from the two directory names above the file.
    Frame probabilities are thresholded at a fixed 0.5 and each maximal run
    of consecutive positive rows (in frame order, per lab / video / agent /
    target / action) becomes one ``[start_frame, stop_frame)`` segment.

    Args:
        target_lab_id: When given, only OOF files of that lab are used.

    Returns:
        DataFrame with columns ``lab_id``, ``video_id``, ``agent_id``,
        ``target_id``, ``action``, ``start_frame``, ``stop_frame``, sorted by
        those keys; empty (same columns) when no segment was found.

    Raises:
        RuntimeError: If no OOF file exists, or none matches ``target_lab_id``.
    """
    key_cols = ["lab_id", "video_id", "agent_id", "target_id", "action"]
    segment_cols = key_cols + ["start_frame", "stop_frame"]

    oof_paths = list(RESULTS_DIR.glob("*/**/oof_predictions.parquet"))
    if not oof_paths:
        raise RuntimeError("Không tìm thấy OOF parquet, hãy train trước.")

    per_file = []
    for oof_path in oof_paths:
        # Layout: <results>/<lab>/<behavior>/oof_predictions.parquet
        *_, lab_id, behavior, _filename = oof_path.parts

        # Keep only files of the requested lab, if one was requested.
        if target_lab_id is not None and lab_id != target_lab_id:
            continue

        table = pd.read_parquet(oof_path)[INDEX_COLS + ["prediction"]].copy()
        table["lab_id"] = lab_id
        table["action"] = behavior
        per_file.append(table)

    if not per_file:
        raise RuntimeError(
            f"Không có OOF predictions nào cho lab_id={target_lab_id}"
        )

    frame_df = (
        pd.concat(per_file, ignore_index=True)
        .sort_values(key_cols + ["video_frame"])
        .reset_index(drop=True)
    )

    # Frame-level probability -> hard label -> segments.
    segments = []
    for group_key, group in frame_df.groupby(key_cols, sort=False):
        lab_id, video_id, agent_id, target_id, action = group_key
        frames = group["video_frame"].values

        # Fixed demo threshold (per-(lab, behavior) thresholds could be
        # stored and applied here instead).
        active = group["prediction"].values >= 0.5

        # Pad with zeros so runs touching either end still have both edges:
        # +1 marks the first row of a run, -1 the row just after its last.
        edges = np.diff(np.concatenate(([0], active.astype(np.int8), [0])))
        run_first = np.flatnonzero(edges == 1)
        run_last = np.flatnonzero(edges == -1) - 1

        for first, last in zip(run_first, run_last):
            segments.append(
                {
                    "lab_id": lab_id,
                    "video_id": int(video_id),
                    "agent_id": int(agent_id),
                    "target_id": int(target_id),
                    "action": action,
                    "start_frame": int(frames[first]),
                    "stop_frame": int(frames[last] + 1),  # [start, stop)
                }
            )

    if not segments:
        return pd.DataFrame(columns=segment_cols)

    submission = pd.DataFrame(segments)
    return submission.sort_values(key_cols + ["start_frame"]).reset_index(drop=True)

# Video ids removed from the metadata before validation scoring
# (see compute_validation_score). The reason for excluding them is not
# recorded here.
BAD_VIDEOS = [143861384, 1596473327, 1212811043, 878123481]

def compute_validation_score(
    submission: pd.DataFrame,
    lab_id: Optional[str] = None,
) -> float:
    """
    Score segment predictions against the training annotations with the
    official ``score()`` metric.

    The ground truth is assembled from the annotation files of every video
    listed in ``train.csv`` (optionally restricted to one lab, and excluding
    BAD_VIDEOS); each annotation row is tagged with its ``lab_id``,
    ``video_id`` and the video's ``behaviors_labeled`` value.

    Args:
        submission: Segment-level predictions with a ``lab_id`` column.
        lab_id: When given, validate on that lab only.

    Returns:
        The metric value, or 0.0 when no annotation could be loaded.
    """
    meta = pd.read_csv(INPUT_DIR / "train.csv")

    restrict_to_lab = lab_id is not None
    if restrict_to_lab:
        meta = meta[meta["lab_id"] == lab_id].reset_index(drop=True)

    if BAD_VIDEOS:
        meta = meta[~meta["video_id"].isin(BAD_VIDEOS)]

    # Load the annotation file of every remaining video.
    truth_parts = []
    for _, meta_row in meta.iterrows():
        lab, vid = meta_row["lab_id"], meta_row["video_id"]
        ann = load_annotation(lab, vid)
        if ann.empty:
            continue
        ann["lab_id"] = lab
        ann["video_id"] = vid
        ann["behaviors_labeled"] = meta_row["behaviors_labeled"]
        truth_parts.append(ann)

    if not truth_parts:
        print("Không có annotation nào để validate!")
        return 0.0

    dataset = pd.concat(truth_parts, ignore_index=True)

    # Restrict the predictions to the same lab.
    if restrict_to_lab:
        submission = submission[submission["lab_id"] == lab_id].reset_index(drop=True)

    # Call the metric.
    s = score(dataset, submission, row_id_column_name="row_id")

    lab_suffix = "" if lab_id is None else " (lab=" + lab_id + ")"
    print(f"Official validation score{lab_suffix}: {s:.6f}")
    return float(s)



# =========================================================
# 8. MAIN
# =========================================================
def str_to_mouse_id(s: str) -> int:
    """
    Convert a mouse label to its integer id.

    ``"self"`` maps to -1. Any other value is converted to a string, every
    occurrence of ``"mouse"`` is removed and the rest is parsed as an int
    (so ``"mouse3"`` gives 3).
    """
    return -1 if s == "self" else int(str(s).replace("mouse", ""))


def predict_behaviors_for_pair(
    lab_id: str,
    video_id: int,
    agent_internal_id: Any,
    target_internal_id: Any,
    behaviors: List[str],
    test_meta: pd.DataFrame,
) -> pd.DataFrame:
    """
    Run inference for one (video, agent, target) pair over a list of behaviors
    (all "self" behaviors or all "pair" behaviors).

    For every behavior the models returned by `load_models_for_behavior_infer`
    (pairs of booster and threshold) are applied frame by frame; a model only
    contributes its probability where that probability reaches its threshold.
    The per-behavior score is the average contribution over the models, each
    frame is labelled with the highest-scoring behavior ("none" when every
    score is 0), and runs of identical labels are merged into segments.

    Returns:
        A segment-level DataFrame with columns ``video_id``, ``action``,
        ``start_frame``, ``stop_frame`` (``stop_frame`` is the last frame of
        the run + 1). The frame is empty, never None, when the lab is not
        "SparklingTapir", when no features or models are available, or when
        no behavior is detected.
    """
    segment_columns = ["video_id", "action", "start_frame", "stop_frame"]

    def empty_segments() -> pd.DataFrame:
        return pd.DataFrame(columns=segment_columns)

    # Callers test `.empty` on the result, so the lab guard must hand back an
    # empty frame rather than None.
    if lab_id != "SparklingTapir":
        return empty_segments()

    frames, feat_df = get_frame_features_for_pair_test(
        lab_id=lab_id,
        video_id=video_id,
        agent_id=agent_internal_id,
        target_id=target_internal_id,
        test_meta=test_meta,
    )
    if feat_df.empty:
        return empty_segments()

    feat_df = feat_df.astype("float32")
    n_frames = len(feat_df)

    scores_per_behavior = {}
    for behavior in behaviors:
        models = load_models_for_behavior_infer(lab_id, behavior)
        if not models:
            continue

        # Build X_test with exactly the feature set of the model; features
        # the extractor did not produce stay at 0.
        req_feats = models[0][0].feature_names
        X_test = pd.DataFrame(
            0.0,
            index=feat_df.index,
            columns=req_feats,
            dtype=np.float32,
        )
        common = [name for name in req_feats if name in feat_df.columns]
        if common:
            X_test[common] = feat_df[common]

        dtest = xgb.DMatrix(X_test, feature_names=req_feats)

        agg_scores = np.zeros(n_frames, dtype=np.float32)
        for booster, thr in models:
            probs = booster.predict(dtest)
            passed = (probs >= thr).astype(np.int8)
            agg_scores += probs * passed

        agg_scores /= max(len(models), 1)
        scores_per_behavior[behavior] = agg_scores

        del dtest, X_test
        gc.collect()

    if not scores_per_behavior:
        return empty_segments()

    beh_list = list(scores_per_behavior.keys())
    score_mat = np.vstack([scores_per_behavior[b] for b in beh_list]).T  # [F, B]

    max_idx = score_mat.argmax(axis=1)
    max_scores = score_mat.max(axis=1)
    frame_labels = np.where(max_scores == 0.0, "none", np.array(beh_list)[max_idx])

    # frame-level labels -> segments
    segments = []

    def close_segment(label, start, last_frame):
        segments.append(
            {
                "video_id": int(video_id),
                "action": label,
                "start_frame": int(start),
                "stop_frame": int(last_frame + 1),
            }
        )

    prev_lab = "none"
    prev_start = None
    prev_f = None
    for f, lab in zip(frames, frame_labels):
        if lab != prev_lab:
            if prev_lab != "none":
                close_segment(prev_lab, prev_start, prev_f)
            prev_lab = lab
            prev_start = f
        prev_f = f

    # Flush the run that is still open at the end of the video.
    if prev_lab != "none":
        close_segment(prev_lab, prev_start, prev_f)

    if not segments:
        return empty_segments()

    return pd.DataFrame(segments)



# Inference driver for a single lab: reads the lab's rows of test.csv, runs
# predict_behaviors_for_pair() for every mouse (self behaviors) and every
# ordered pair of mice (pair behaviors) of each test video, then writes the
# merged segments to WORKING_DIR / "submission7.csv".
target_lab = "SparklingTapir"
print(f"Đọc test.csv cho lab {target_lab} ...")
test_meta = pd.read_csv(INPUT_DIR / "test.csv")
test_meta = test_meta[test_meta["lab_id"] == target_lab].reset_index(drop=True)
video_size_meta = test_meta[["video_id", "video_width_pix", "video_height_pix"]].copy()


# Trained behaviors = names of the sub-directories of RESULTS_DIR / target_lab
lab_result_dir = RESULTS_DIR / target_lab
if lab_result_dir.exists():
    trained_behaviors = sorted(
        [p.name for p in lab_result_dir.iterdir() if p.is_dir()]
    )
else:
    trained_behaviors = []

self_behaviors_in_lab = [b for b in trained_behaviors if b in SELF_BEHAVIORS]
pair_behaviors_in_lab = [b for b in trained_behaviors if b in PAIR_BEHAVIORS]

print("Behaviors (self) dùng để predict:", self_behaviors_in_lab)
print("Behaviors (pair) dùng để predict:", pair_behaviors_in_lab)

all_segments = []

# Loop over every test video of the lab
for video_id in sorted(test_meta["video_id"].unique()):
    print(f"Predict video_id={video_id} ...")

    tracking = load_tracking_test(target_lab, video_id)
    mouse_ids_internal = sorted(tracking["mouse_id"].unique().tolist())

    # Map an internal mouse_id to the string used in the submission
    # (prefixed with "mouse" unless it already starts with it)
    def to_submit_id(mid):
        s = str(mid)
        return s if s.startswith("mouse") else f"mouse{s}"

    # SELF behaviors: agent == target (self)
    if self_behaviors_in_lab:
        for mid in mouse_ids_internal:
            seg_df = predict_behaviors_for_pair(
                lab_id=target_lab,
                video_id=video_id,
                agent_internal_id=mid,
                target_internal_id=mid,  # self
                behaviors=self_behaviors_in_lab,
                test_meta=test_meta,
            )
            if not seg_df.empty:
                seg_df["agent_id"] = to_submit_id(mid)
                seg_df["target_id"] = "self"
                all_segments.append(seg_df)

    # PAIR behaviors: every ordered pair with agent != target
    if pair_behaviors_in_lab and len(mouse_ids_internal) > 1:
        for agent_internal, target_internal in itertools.permutations(
            mouse_ids_internal, 2
        ):
            seg_df = predict_behaviors_for_pair(
                lab_id=target_lab,
                video_id=video_id,
                agent_internal_id=agent_internal,
                target_internal_id=target_internal,
                behaviors=pair_behaviors_in_lab,
                test_meta=test_meta,
            )
            if not seg_df.empty:
                seg_df["agent_id"] = to_submit_id(agent_internal)
                seg_df["target_id"] = to_submit_id(target_internal)
                all_segments.append(seg_df)

# Merge all segments into the submission7 frame (saved as submission7.csv below)
if all_segments:
    submission7 = pd.concat(all_segments, ignore_index=True)
    submission7 = submission7[
        ["video_id", "agent_id", "target_id", "action", "start_frame", "stop_frame"]
    ]
    submission7 = submission7.sort_values(
        ["video_id", "agent_id", "target_id", "action", "start_frame"]
    ).reset_index(drop=True)
else:
    # Empty DataFrame, NO dummy row
    submission7 = pd.DataFrame(
        columns=[
            "video_id",
            "agent_id",
            "target_id",
            "action",
            "start_frame",
            "stop_frame",
        ]
    )

# Add row_id (also when the frame is empty)
submission7.insert(0, "row_id", np.arange(len(submission7), dtype=np.int64))

sub_path = WORKING_DIR / "submission7.csv"
submission7.to_csv(sub_path, index=False)
print(f"Saved SparklingTapir submission to {sub_path}")



Đọc test.csv cho lab SparklingTapir ...
Behaviors (self) dùng để predict: []
Behaviors (pair) dùng để predict: ['attack', 'defend', 'escape', 'mount']
Saved SparklingTapir submission to /kaggle/working/submission7.csv


# TranquilPanther

In [14]:
from pathlib import Path
import shutil
import gc

WORKING_DIR = Path("/kaggle/working")

# 1) Remove everything directly under /kaggle/working except regular .csv files.
#    The cleanup is best-effort: a failure is reported and the loop goes on.
for path in WORKING_DIR.iterdir():
    if path.is_file():
        # keep the .csv files
        if path.suffix == ".csv":
            continue
        try:
            path.unlink()
        except OSError as e:
            print(f"Cannot remove file {path}: {e}")
    elif path.is_dir():
        # ignore_errors=True keeps deleting past individual failures but never
        # raises, so leftovers are detected by checking the path afterwards.
        shutil.rmtree(path, ignore_errors=True)
        if path.exists():
            print(f"Cannot remove dir {path}: some entries could not be deleted")


gc.collect()

76

In [15]:
from __future__ import annotations
from typing import Dict, List, Tuple, Any, Optional
import warnings
from dataclasses import dataclass, field
import numpy as np
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from tqdm import tqdm

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
np.seterr(invalid="ignore", divide="ignore")

# =============================================================================
# 1. CONFIGURATION
# =============================================================================
@dataclass
class FeatureConfig:
    """
    Holds the configuration parameters (hyperparameters).
    """
    # Frame rate of the video; FeatureExtractor._scale rescales 30-fps window
    # lengths by fps / 30 and _compute_kinematics uses 1 / fps as time step.
    fps: float = 30.0
    # Divisor applied by FeatureExtractor._to_cm to turn pixels into cm.
    pix_per_cm: float = 1.0
    # Sigma handed to gaussian_filter1d in FeatureExtractor._smooth;
    # None turns the smoothing off.
    smooth_sigma: float = 1.0
    # Stored by FeatureExtractor.__init__; presumably toggles the pairwise
    # feature modules — TODO confirm where it is read.
    use_pairwise: bool = True


# =============================================================================
# 2. AGENT CONTEXT
# =============================================================================
@dataclass
class AgentContext:
    """
    Container for the pre-processed data of a single mouse.
    Avoids recomputing velocity/acceleration several times.
    """
    idx: pd.Index          # frame index
    pos: np.ndarray        # [F, 2] cm
    vel: np.ndarray        # [F, 2] cm/s
    speed: np.ndarray      # [F, 1] cm/s
    acc: np.ndarray        # [F, 2] cm/s^2
    
    cx: pd.Series          # X coordinate as a Series (for rolling operations)
    cy: pd.Series          # Y coordinate as a Series
    speed_series: pd.Series # speed as a Series
    
    raw_df: Optional[pd.DataFrame] = None # original per-body-part tracking data


# =============================================================================
# 3. FEATURE EXTRACTOR
# =============================================================================
class FeatureExtractor:
    """
    Class trích xuất đặc trưng hành vi từ dữ liệu tracking.
    """
    def __init__(self, fps: float, pix_per_cm: float, smooth_sigma: float = 1.0, use_pairwise: bool = True):
        """
        Store the settings in a FeatureConfig and register the feature modules.

        `fps` and `pix_per_cm` are coerced to float; `smooth_sigma` and
        `use_pairwise` are stored unchanged.
        """
        settings = {
            "fps": float(fps),
            "pix_per_cm": float(pix_per_cm),
            "smooth_sigma": smooth_sigma,
            "use_pairwise": use_pairwise,
        }
        self.cfg = FeatureConfig(**settings)

        # (registry key, method name) of every feature module that will run;
        # the registry keeps this insertion order.
        module_table = (
            ("kinematics", "_feat_basic_kinematics"),
            ("multiscale", "_feat_multiscale"),
            ("long_range", "_feat_long_range"),
            ("cumulative", "_feat_cumulative"),
            ("curvature", "_feat_curvature"),
            ("speed_asym", "_feat_speed_asym"),
            ("gauss_shift", "_feat_gauss_shift"),
            ("pose_shape", "_feat_pose_shape"),
            ("pairwise", "_feat_pairwise"),
            ("follow", "_feat_follow_pattern"),
            ("short", "_feat_shortburst_social"),
            ("a", "_feat_attack_sniff"),
            ("b", "_feat_climb"),
            ("c", "_feat_ejaculate_temporal"),
        )
        self.feature_registry = {
            key: getattr(self, method_name) for key, method_name in module_table
        }

    # --- Helpers ---
    def _scale(self, n_frames_30fps: int) -> int:
        """Quy đổi số frame từ chuẩn 30fps sang fps thực tế của video."""
        return max(1, int(round(n_frames_30fps * self.cfg.fps / 30.0)))

    def _to_cm(self, arr):
        """Chuyển pixel -> cm."""
        return arr / self.cfg.pix_per_cm

    def _smooth(self, x):
        """Làm mượt dữ liệu bằng Gaussian filter."""
        if self.cfg.smooth_sigma is None or x.shape[0] < 3: return x
        if np.all(np.isnan(x)): return x
        return gaussian_filter1d(x, sigma=self.cfg.smooth_sigma, axis=0, mode="nearest")

    def _forward_fill_nan(self, pos):
        """
        Điền dữ liệu thiếu (NaN) bằng giá trị hợp lệ trước đó (Forward Fill).
        """
        if np.all(np.isnan(pos)):
            return np.zeros_like(pos)

        pos_ffill = pos.copy()
        mask = np.any(~np.isnan(pos_ffill), axis=1)
        if not mask.any():
            return np.zeros_like(pos_ffill)

        valid_idx = np.where(mask)[0]
        first, last = valid_idx[0], valid_idx[-1]
        pos_ffill[:first] = pos_ffill[first]
        pos_ffill[last + 1:] = pos_ffill[last]
        df_temp = pd.DataFrame(pos_ffill)
        df_temp = df_temp.ffill()
        return df_temp.to_numpy()
    
    def _speed_series(self, cx: pd.Series, cy: pd.Series) -> pd.Series:
        dx = cx.diff()
        dy = cy.diff()
        v = np.hypot(dx, dy).fillna(0.0) * self.cfg.fps
        return v.astype("float32")
    
    def _roll_future_mean(self, s: pd.Series, w: int, min_p: int = 1) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).mean().iloc[::-1]

    def _roll_future_var(self, s: pd.Series, w: int, min_p: int = 2) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).var().iloc[::-1]

    # --- Core Logic ---
    def _compute_kinematics(self, pos_px: np.ndarray):
        """
        Tính toán vật lý cơ bản: Pos(cm), Vel, Speed, Acc.
        Input: Array [Frames, 2] (pixel).
        Output: Tuple (pos_cm, vel, speed, acc).
        """
        pos_ffill = self._forward_fill_nan(pos_px)
        pos_cm = self._to_cm(pos_ffill.astype(np.float32))
        pos_cm = self._smooth(pos_cm)                                               # [F, 2]

        dt = 1.0 / self.cfg.fps
        vel = np.zeros_like(pos_cm, dtype=np.float32)
        vel[1:] = (pos_cm[1:] - pos_cm[:-1]) / dt                                   # [F, 2: (vx, vy)]
        speed = np.linalg.norm(vel, axis=1, keepdims=True).astype(np.float32)       # [F, 1]

        acc = np.zeros_like(pos_cm, dtype=np.float32)                          
        acc[1:] = (vel[1:] - vel[:-1]) / dt                                         # [F, 2:(ax, ay)]
        return pos_cm.astype(np.float32), vel, speed, acc

    def _build_context(self, frames, pos_px, mouse_df=None) -> AgentContext:
        """
        Build the AgentContext of one mouse: kinematics computed from `pos_px`,
        Series views of x, y and speed indexed by `frames` (index name "frame"),
        and `mouse_df` passed through as ``raw_df``.
        """
        pos, vel, speed, acc = self._compute_kinematics(pos_px)
        frame_index = pd.Index(frames, name="frame")

        series_views = {
            "cx": pd.Series(pos[:, 0], index=frame_index),
            "cy": pd.Series(pos[:, 1], index=frame_index),
            "speed_series": pd.Series(speed[:, 0], index=frame_index),
        }
        return AgentContext(
            idx=frame_index,
            pos=pos,
            vel=vel,
            speed=speed,
            acc=acc,
            raw_df=mouse_df,
            **series_views,
        )

    # --- Feature Modules ---
    def _feat_basic_kinematics(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Lấy các giá trị thô: tọa độ x, y, vận tốc vx, vy, tốc độ, gia tốc ax, ay.
        """
        return {
            "a_x": ctx.pos[:, 0], "a_y": ctx.pos[:, 1],
            "a_vx": ctx.vel[:, 0], "a_vy": ctx.vel[:, 1],
            "a_speed": ctx.speed[:, 0],
            "a_ax": ctx.acc[:, 0], "a_ay": ctx.acc[:, 1]
        }

    def _feat_multiscale(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tính tốc độ trung bình (Mean) và độ lệch chuẩn (Std) ở đa mức thời gian.
        Feature 'sp_ratio' đo độ bùng nổ (Burstiness).
        """
        feats = {}
        speed = ctx.speed_series
        frame_scales = [10, 40, 160]
        for scale in frame_scales:
            ws = self._scale(scale)
            if len(speed) >= ws:
                roller = speed.rolling(ws, min_periods=max(1, ws//4), center=True)
                feats[f"sp_m{scale}"] = roller.mean().astype("float32")
                feats[f"sp_s{scale}"] = roller.std().astype("float32")
        feats[f"sp_ratio"] = feats["sp_m10"] / (feats["sp_m160"] + 1e-6)
        return feats 
        
    def _feat_long_range(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Đặc trưng ngữ cảnh dài hạn:
        - x_ml, y_ml: Vị trí trung bình trong quá khứ.
        - sp_pct: Xếp hạng (percentile) của tốc độ hiện tại so với quá khứ.
        """
        feats: Dict[str, pd.Series] = {}
        speed = ctx.speed_series

        for window in [120, 240]:
            ws = self._scale(window)
            if len(ctx.cx) >= ws:
                feats[f"x_ml{window}"] = ctx.cx.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()
                feats[f"y_ml{window}"] = ctx.cy.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()

        for span in [60, 120]:
            s = self._scale(span)
            feats[f"x_e{span}"] = ctx.cx.ewm(span=s, min_periods=1).mean()
            feats[f"y_e{span}"] = ctx.cy.ewm(span=s, min_periods=1).mean()

        for window in [60, 120]:
            ws = self._scale(window)
            if len(speed) >= ws:
                feats[f"sp_pct{window}"] = speed.rolling(
                    ws, min_periods=max(5, ws // 6), center=True
                ).rank(pct=True)
        return feats
    

    def _feat_curvature(self, ctx: AgentContext, **kwargs) -> Dict:
        feats = {}

        vel_x, vel_y = ctx.vel[:, 0], ctx.vel[:, 1]
        acc_x, acc_y = ctx.acc[:, 0], ctx.acc[:, 1]
        cross_prod = vel_x * acc_y - vel_y * acc_x
        vel_mag = np.sqrt(vel_x**2 + vel_y**2)
        moving_mask = vel_mag > 2.0
        vel_mag_safe = np.maximum(vel_mag, 0.1 / self.cfg.fps)
        raw_curv = cross_prod / (vel_mag_safe**3)
        raw_curv = np.where(moving_mask, raw_curv, 0.0)
        min_turn_radius_cm = 0.5
        max_k = 1.0 / min_turn_radius_cm
        raw_curv = np.clip(raw_curv, -max_k, max_k)
        abs_curv = np.abs(raw_curv)
        abs_curv_series = pd.Series(abs_curv, index=ctx.idx)

        for w in [30, 60]:
            ws = self._scale(w)
            min_p = max(ws // 3, 1)
            feats[f"curv_mean_{w}"] = abs_curv_series.rolling(ws, min_periods=min_p).mean()

        angle = np.arctan2(vel_y, vel_x)
        angle_series = pd.Series(angle, index=ctx.idx)
        angle_change = np.abs(angle_series.diff().fillna(0.0))
        angle_change = np.where(angle_change > np.pi, 2 * np.pi - angle_change, angle_change)
        angle_change_series = pd.Series(angle_change, index=ctx.idx)
        angle_change_series = pd.Series(np.where(moving_mask, angle_change_series, 0.0), index=ctx.idx)

        ws = self._scale(30)
        feats["turn_rate_30"] = angle_change_series.rolling(ws, min_periods=max(ws // 3, 1)).sum()

        return feats
    
    def _feat_cumulative(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tổng quãng đường di chuyển trong một khoảng thời gian dài xung quanh frame hiện tại.
        """
        feats = {}
        L = max(1, self._scale(180))
        step = np.hypot(ctx.cx.diff(), ctx.cy.diff()).fillna(0.0)
        path = step.rolling(2 * L + 1, min_periods=max(5, L // 6), center=True).sum()
        feats["path_cum180"] =  path.fillna(0.0).astype("float32")
        return feats

    def _feat_speed_asym(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Bất đối xứng tốc độ (Tương lai - Quá khứ).
        """
        w = max(3, self._scale(30))
        v = ctx.speed_series
        v_past = v.rolling(w, min_periods=1).mean()
        v_fut = self._roll_future_mean(v, w, min_p=1)
        return {"spd_asym_1s": (v_fut - v_past).fillna(0.0)}
    
    def _feat_gauss_shift(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Độ lệch Gaussian (KL Divergence) giữa quá khứ và tương lai.
        Đo lường sự thay đổi trạng thái thống kê.
        """
        w = max(5, self._scale(30))
        v = ctx.speed_series
        mu_p = v.rolling(w, min_periods=1).mean()
        va_p = v.rolling(w, min_periods=1).var().clip(lower=1e-6)
        mu_f = self._roll_future_mean(v, w, min_p=1)
        va_f = self._roll_future_var(v, w, min_p=1).clip(lower=1e-6)

        kl_pf = 0.5 * (
            (va_p / va_f) + ((mu_f - mu_p) ** 2) / va_f - 1.0 + np.log(va_f / va_p)
        )
        kl_fp = 0.5 * (
            (va_f / va_p) + ((mu_p - mu_f) ** 2) / va_p - 1.0 + np.log(va_p / va_f)
        )
        return {
            "spd_symkl_1s": (kl_pf + kl_fp).replace([np.inf, -np.inf], np.nan).fillna(0.0)
        }
    
    def _extract_part(self, ctx: AgentContext, part: str) -> Optional[np.ndarray]:
        if ctx.raw_df is None: return None
        if part not in ctx.raw_df.columns.get_level_values(0): return None
        try:
            sub_df = ctx.raw_df.xs(part, axis=1, level=0)[["x", "y"]].reindex(ctx.idx)
        except KeyError: return None
        raw = sub_df.to_numpy()
        raw = self._forward_fill_nan(raw)
        cm = self._to_cm(raw.astype(np.float32))
        return self._smooth(cm)
    
    def _extract_parts_dict(self, ctx: AgentContext, parts: List[str] = None) -> Dict[str, Optional[np.ndarray]]:
        out = {}
        for p in parts:
            out[p] = self._extract_part(ctx, p)
        return out
        
    def _feat_pose_shape(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Placeholder cho các đặc trưng hình dáng (Elongation, Body Angle...).
        """
        feats = {}

        def zero(): return pd.Series(0.0, index=ctx.idx, dtype="float32")

        def dist(k1, k2):
            p1, p2 = parts.get(k1), parts.get(k2)
            if p1 is None or p2 is None: return zero()
            d = np.linalg.norm(p1 - p2, axis=1)
            return pd.Series(d, index=ctx.idx, dtype="float32")
        
        def body_angle():
            if parts.get("nose") is None: return zero()
            if parts.get("neck") is None: return zero()
            if parts.get("tail_base") is None: return zero()

            v1 = parts.get("nose") - parts.get("neck")
            v2 = parts.get("tail_base") - parts.get("neck")
            dot_product = np.sum(v1 * v2, axis=1)
            mag = np.linalg.norm(v1, axis=1) * np.linalg.norm(v2, axis=1)
            cos_angle = np.clip(dot_product / (mag + 1e-6), -1.0, 1.0).astype("float32")
            return cos_angle
        
        def elongation():
            if parts.get("nose")          is None: return zero()
            if parts.get("tail_base")     is None: return zero()
            if parts.get("hip_left")  is None: return zero()
            if parts.get("hip_right") is None: return zero()

            d1 = dist("nose", "tail_base")
            d2 = dist("hip_left", "hip_right")
            elongation = d1 / (d2 + 1e-6).astype("float32")
            return elongation
        
        def part_speed(part: str, n_frames_30fps: int) -> Dict:
            part_pos = self._extract_part(ctx, part)
            if part_pos is None: return zero()
            
            s_x = pd.Series(part_pos[:, 0], index=ctx.idx)
            s_y = pd.Series(part_pos[:, 1], index=ctx.idx)
            raw_speed = self._speed_series(s_x, s_y)

            ws = self._scale(n_frames_30fps)
            val = raw_speed.rolling(ws, min_periods=1, center=True).mean()
            return val.astype("float32")


        target_parts = ["nose", "hip_left", "hip_right", "ear_left", "ear_right", "tail_base", "neck"]
        
        parts = self._extract_parts_dict(ctx, target_parts)

        feats["aa_nose_tailbase_dist"]       = dist("nose", "tail_base")
        feats["aa_earleft_tailbase_dist"]    = dist("ear_left", "tail_base")
        feats["aa_earright_tailbase_dist"]   = dist("ear_right", "tail_base")
        feats["aa_nose_earleft_dist"]        = dist("ear_left", "nose")
        feats["aa_nose_ear_right_dist"]      = dist("ear_right", "nose")
        feats["aa_nose_hip_left_dist"]       = dist("nose", "hip_left")
        feats["aa_nose_hip_right_dist"]      = dist("nose", "hip_right")
        feats["aa_neck_tailbase_dist"] = dist("neck", "tail_base")
        
        feats["a_elongation"]                = elongation()
        feats["a_bodyangle"]                 = body_angle()

        a_tail_base_vel_500ms     = part_speed("tail_base", 15)
        a_tail_base_vel_1000ms    = part_speed("tail_base", 30)
        a_tail_base_vel_2000ms    = part_speed("tail_base", 60)
        a_tail_base_vel_3000ms    = part_speed("tail_base", 90)


        a_hip_left_vel_500ms          = part_speed("hip_left", 15)
        a_hip_left_vel_1000ms         = part_speed("hip_left", 30)
        a_hip_left_vel_2000ms         = part_speed("hip_left", 60)
        a_hip_left_vel_3000ms         = part_speed("hip_left", 90)

        a_hip_right_vel_500ms          = part_speed("hip_left", 15)
        a_hip_right_vel_1000ms         = part_speed("hip_left", 30)
        a_hip_right_vel_2000ms         = part_speed("hip_left", 60)
        a_hip_right_vel_3000ms         = part_speed("hip_left", 90)

        feats["a_upper_vel_500ms"]            = (a_tail_base_vel_500ms + a_hip_left_vel_500ms + a_hip_right_vel_500ms)/3.0
        feats["a_upper_vel_1000ms"]           = (a_tail_base_vel_1000ms + a_hip_left_vel_1000ms + a_hip_right_vel_1000ms)/3.0
        feats["a_upper_vel_2000ms"]           = (a_tail_base_vel_2000ms + a_hip_left_vel_2000ms + a_hip_right_vel_2000ms)/3.0
        feats["a_upper_vel_3000ms"]           = (a_tail_base_vel_3000ms + a_hip_left_vel_3000ms + a_hip_right_vel_3000ms)/3.0


        feats["a_nose_vel_500ms"]            = part_speed("nose", 15)
        feats["a_nose_vel_1000ms"]           = part_speed("nose", 30)
        feats["a_nose_vel_2000ms"]           = part_speed("nose", 60)
        feats["a_nose_vel_3000ms"]           = part_speed("nose", 90)

        # feats["a_ear_right_vel_500ms"]       = part_speed("hip_right", 15)
        # feats["a_ear_right_vel_1000ms"]      = part_speed("hip_right", 30)
        # feats["a_ear_right_vel_2000ms"]      = part_speed("hip_right", 60)
        # feats["a_ear_right_vel_3000ms"]      = part_speed("hip_right", 90)
        # feats["a_ear_left_vel_500ms"]        = part_speed("ear_left", 15)
        # feats["a_ear_left_vel_1000ms"]       = part_speed("ear_left", 30)
        # feats["a_ear_left_vel_2000ms"]       = part_speed("ear_left", 60)
        # feats["a_ear_left_vel_3000ms"]       = part_speed("ear_left", 90)
        
        return feats

    def _feat_attack_sniff(
        self,
        ctx: AgentContext,
        target_ctx: AgentContext = None,
        **kwargs
    ) -> Dict[str, pd.Series]:
        """
        Đặc trưng phân biệt attack vs sniff cho lab 2-mouse (agent=1, target=2).
    
        Ý tưởng:
          - attack: speed 2 con biến động mạnh, đổi hướng nhiều, body overlap cao.
          - sniff : mũi gần cổ/thân, overlap thấp hơn, motion nhẹ/ổn định hơn.
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
    
        def zero():
            return pd.Series(0.0, index=idx, dtype="float32")

        # helper khoảng cách
        def dist(p1, p2):
            if p1 is None or p2 is None:
                return zero()
            d = np.linalg.norm(p1 - p2, axis=1)
            return pd.Series(d, index=idx, dtype="float32")

        parts_a = self._extract_parts_dict(ctx, ["nose", "tail_base"])
        parts_t = self._extract_parts_dict(target_ctx, ["nose", "tail_base"])
    
        # ---------------------------------------------------------
        # 2) ĐIỂM ĐẠI DIỆN THÂN (BODY CENTER) CHO MỖI CON
        #    dùng trung bình neck – hips – tail_base
        # ---------------------------------------------------------
    
        # ---------------------------------------------------------
        # 4) MỨC ĐỘ “BẠO LỰC”: DAO ĐỘNG TỐC ĐỘ & ĐỔI HƯỚNG
        # ---------------------------------------------------------
        # speed 2 con từ velocity
        a_speed = pd.Series(
            np.linalg.norm(ctx.vel, axis=1),
            index=idx,
            dtype="float32",
        )
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32",
        )

        ws_05 = self._scale(15)  # ~0.5s
        mp_05 = max(ws_05 // 3, 1)
    
        feats["as_a_speed_std_05"] = (
            a_speed.rolling(ws_05, min_periods=mp_05).std().fillna(0.0).astype("float32")
        )
        feats["as_t_speed_std_05"] = (
            t_speed.rolling(ws_05, min_periods=mp_05).std().fillna(0.0).astype("float32")
        )
        feats["as_speed_std_sum_05"] = (
            feats["as_a_speed_std_05"] + feats["as_t_speed_std_05"]
        )
    
        # Đổi hướng (jerk góc) của agent
        a_angle = np.arctan2(ctx.vel[:, 1], ctx.vel[:, 0])
        a_angle_diff = np.abs(np.diff(a_angle))
        a_angle_diff = np.where(
            a_angle_diff > np.pi, 2 * np.pi - a_angle_diff, a_angle_diff
        )
        a_angle_diff = np.concatenate([[0.0], a_angle_diff])
        a_angle_diff_s = pd.Series(a_angle_diff, index=idx, dtype="float32")
    
        feats["as_a_turn_jerk_05"] = (
            a_angle_diff_s.rolling(ws_05, min_periods=mp_05)
            .sum()
            .fillna(0.0)
            .astype("float32")
        )

        # ---------------------------------------------------------
        # 5) XẤP XỈ OVERLAP CƠ THỂ (BODY OVERLAP)
        #    dùng bbox từ các bộ phận thân
        # ---------------------------------------------------------
        def build_bbox(parts: Dict[str, Optional[np.ndarray]]):
            arrs = []
            for k in ["nose", "hip_left", "hip_right", "ear_left", "ear_right", "tail_base"]:
                if parts.get(k) is not None:
                    arrs.append(parts[k])
            if not arrs:
                return None
            stack = np.stack(arrs, axis=1)  # [F, K, 2]
            xs = stack[:, :, 0]
            ys = stack[:, :, 1]
            xmin = np.nanmin(xs, axis=1)
            xmax = np.nanmax(xs, axis=1)
            ymin = np.nanmin(ys, axis=1)
            ymax = np.nanmax(ys, axis=1)
            return np.stack([xmin, ymin, xmax, ymax], axis=1).astype("float32")
    
        def iou_box(box1: np.ndarray, box2: np.ndarray):
            # box: [F, 4] = (xmin, ymin, xmax, ymax)
            x1 = np.maximum(box1[:, 0], box2[:, 0])
            y1 = np.maximum(box1[:, 1], box2[:, 1])
            x2 = np.minimum(box1[:, 2], box2[:, 2])
            y2 = np.minimum(box1[:, 3], box2[:, 3])
    
            inter_w = np.clip(x2 - x1, 0.0, None)
            inter_h = np.clip(y2 - y1, 0.0, None)
            inter = inter_w * inter_h
    
            area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])
            area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
            union = area1 + area2 - inter + 1e-6
            iou = inter / union
            return iou.astype("float32")

        bbox_a = build_bbox(parts_a)
        bbox_t = build_bbox(parts_t)
        if bbox_a is not None and bbox_t is not None:
            iou = iou_box(bbox_a, bbox_t)
            iou_s = pd.Series(iou, index=idx, dtype="float32")
    
            feats["as_body_iou"] = iou_s
    
            ws_1s = self._scale(30)
            mp_1s = max(ws_1s // 3, 1)
            feats["as_body_iou_mean_1s"] = (
                iou_s.rolling(ws_1s, min_periods=mp_1s).mean().fillna(0.0).astype("float32")
            )
        else:
            feats["as_body_iou"] = zero()
            feats["as_body_iou_mean_1s"] = zero()
    
        # ---------------------------------------------------------
        # 6) DỌN NẠN NaN / Inf
        # ---------------------------------------------------------
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )
    
        return feats

    def _feat_climb(self, ctx: AgentContext, **kwargs) -> Dict[str, pd.Series]:
        """
        Wall-climbing features for an axis-aligned rectangular arena.

        Idea:
          - A mouse heading for a wall shows a quickly shrinking wall distance.
          - While climbing it stays close to the wall with a near-zero velocity
            component along the wall normal, but may still move along the wall.

        The arena is modelled as the rectangle [0, W] x [0, H] in the same
        units as the context positions (cm according to ``AgentContext``).
        W / H default to 28.0 / 18.0 and can be overridden, in priority order,
        by the ``arena_width_cm`` / ``arena_height_cm`` keyword arguments or by
        attributes of the same names on ``self.cfg``.

        Args:
            ctx: Pre-processed context of the agent.
            **kwargs: Optional ``arena_width_cm`` / ``arena_height_cm``; any
                other key (e.g. ``target_ctx``) is ignored.

        Returns:
            Dict of float32 Series indexed by ``ctx.idx`` with NaN/Inf replaced
            by 0: ``climb_dist_wall``, ``climb_normal_vel``,
            ``climb_tangent_vel``, ``climb_approach_speed_wall`` and
            ``climb_wall_stick_score``.
        """
        feats: Dict[str, pd.Series] = {}
        idx = ctx.idx

        # --- 1. Arena size (falls back to the previously hard-coded 28 x 18) ---
        cfg = getattr(self, "cfg", None)
        W = float(
            kwargs.get("arena_width_cm")
            or getattr(cfg, "arena_width_cm", None)
            or 28.0
        )
        H = float(
            kwargs.get("arena_height_cm")
            or getattr(cfg, "arena_height_cm", None)
            or 18.0
        )

        # --- 2. Reference point: nose when available, else the body centre ---
        parts = self._extract_parts_dict(ctx, ["nose"])
        head = parts.get("nose")

        if head is not None:
            # Original author's note: extracted parts are already converted to
            # cm and smoothed by the part extractor.
            cx = pd.Series(head[:, 0], index=idx)
            cy = pd.Series(head[:, 1], index=idx)
        else:
            cx = ctx.cx
            cy = ctx.cy

        # --- 3. Distance to each of the four walls ---
        dist_left = cx - 0.0
        dist_right = W - cx
        dist_bottom = cy - 0.0
        dist_top = H - cy

        d_all = np.stack(
            [dist_left.values, dist_right.values, dist_bottom.values, dist_top.values],
            axis=1,  # [F, 4]
        )

        dist_wall = np.min(d_all, axis=1)    # distance to the nearest wall
        wall_idx = np.argmin(d_all, axis=1)  # 0:left, 1:right, 2:bottom, 3:top

        dist_wall_s = pd.Series(dist_wall, index=idx, dtype="float32")
        feats["climb_dist_wall"] = dist_wall_s

        # --- 4. Velocity split along the NORMAL / TANGENT of the nearest wall ---
        # NOTE: velocity comes from ctx.vel while the position above may come
        # from the nose.
        vx = ctx.vel[:, 0]
        vy = ctx.vel[:, 1]

        # Normal points from the wall INTO the arena:
        #   left  (x=0) -> (+1, 0)    right (x=W) -> (-1, 0)
        #   bottom(y=0) -> (0, +1)    top   (y=H) -> (0, -1)
        nx = np.zeros_like(vx, dtype="float32")
        ny = np.zeros_like(vy, dtype="float32")
        nx[wall_idx == 0] = 1.0
        nx[wall_idx == 1] = -1.0
        ny[wall_idx == 2] = 1.0
        ny[wall_idx == 3] = -1.0

        # v_normal = v . n
        v_normal = vx * nx + vy * ny

        # Component parallel to the wall: v_tan = v - (v . n) n
        v_proj_x = v_normal * nx
        v_proj_y = v_normal * ny
        v_tan_x = vx - v_proj_x
        v_tan_y = vy - v_proj_y
        v_tangent = np.sqrt(v_tan_x ** 2 + v_tan_y ** 2)

        v_normal_s = pd.Series(v_normal, index=idx, dtype="float32")
        v_tangent_s = pd.Series(v_tangent, index=idx, dtype="float32")

        feats["climb_normal_vel"] = v_normal_s
        feats["climb_tangent_vel"] = v_tangent_s

        # --- 5. Approach speed: rolling mean of the per-frame drop in dist_wall ---
        ws = self._scale(15)  # window requested as 15 frames, rescaled by _scale
        min_p = max(ws // 3, 1)

        # Sign flipped so that "distance decreasing" becomes positive.
        diff_dw = -dist_wall_s.diff().fillna(0.0)
        approach = diff_dw.rolling(ws, min_periods=min_p).mean()
        feats["climb_approach_speed_wall"] = approach.astype("float32")

        # --- 6. Stick score: near the wall AND small |v_normal| ---
        thr_cm = 3.0  # "near the wall" threshold
        near_wall = (dist_wall_s < thr_cm).astype("float32")

        # Larger when the animal is no longer moving towards/away from the wall.
        stick = near_wall * (1.0 / (1.0 + v_normal_s.abs()))

        # Require some motion along the wall (tangent speed above 0.5).
        stick = stick * (v_tangent_s > 0.5).astype("float32")

        feats["climb_wall_stick_score"] = stick.astype("float32")

        # --- 7. Clean NaN/Inf ---
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )

        return feats


    def _feat_pairwise(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict:
        """
        Pairwise interaction features between an agent and a target.

        Covers centre-to-centre distance, distances from the agent's nose to
        several target body parts, body/gaze alignment cosines and velocity
        components along / across the agent->target axis.

        Args:
            ctx: Pre-processed context of the agent.
            target_ctx: Pre-processed context of the target. When ``None`` an
                empty dict is returned.
            **kwargs: Ignored.

        Returns:
            Dict mapping feature name to a float32 Series indexed by
            ``ctx.idx``. Values are not NaN/Inf-cleaned here.
        """
        feats = {}
        if target_ctx is None:
            return feats

        def zero():
            return pd.Series(0.0, index=ctx.idx, dtype="float32")

        def dist_ab(pt_a, pt_b):
            # A missing body part on either side yields an all-zero series.
            if pt_a is None or pt_b is None:
                return zero()
            d = np.linalg.norm(pt_a - pt_b, axis=1)
            return pd.Series(d, index=ctx.idx, dtype="float32")

        rel_vec = target_ctx.pos - ctx.pos
        dist = np.linalg.norm(rel_vec, axis=1)
        feats["rel_dist"] = pd.Series(dist, index=ctx.idx, dtype="float32")

        # --- Distances from the agent's nose to target body parts ---
        part_names = [
            "nose", "ear_left", "ear_right", "body_center",
            "tail_base", "hip_left", "hip_right", "neck",
        ]
        my_parts = self._extract_parts_dict(ctx, part_names)
        target_parts = self._extract_parts_dict(target_ctx, part_names)

        # .get() keeps this method consistent with its siblings and turns an
        # absent key into the same "missing part" path that dist_ab handles.
        an = my_parts.get("nose")
        feats["dist_nose_nose"] = dist_ab(an, target_parts.get("nose"))
        feats["dist_nose_tail"] = dist_ab(an, target_parts.get("tail_base"))
        feats["dist_nose_el"] = dist_ab(an, target_parts.get("ear_left"))
        feats["dist_nose_er"] = dist_ab(an, target_parts.get("ear_right"))
        feats["dist_nose_hip_l"] = dist_ab(an, target_parts.get("hip_left"))
        feats["dist_nose_hip_r"] = dist_ab(an, target_parts.get("hip_right"))
        feats["dist_nose_neck"] = dist_ab(an, target_parts.get("neck"))

        # --- Orientation: body vector = nose - tail_base ---
        def get_body_vec(parts_dict):
            head = parts_dict.get("nose")
            tail = parts_dict.get("tail_base")
            if head is not None and tail is not None:
                return head - tail
            return None

        a_vec = get_body_vec(my_parts)
        t_vec = get_body_vec(target_parts)

        if a_vec is not None and t_vec is not None:
            dot = np.sum(a_vec * t_vec, axis=1)
            mags = np.linalg.norm(a_vec, axis=1) * np.linalg.norm(t_vec, axis=1)
            feats["body_cosine"] = pd.Series(
                np.clip(dot / (mags + 1e-6), -1.0, 1.0), index=ctx.idx, dtype="float32"
            )
        else:
            feats["body_cosine"] = zero()

        # Gaze cosine: agent body vector against the agent->target vector.
        if a_vec is not None:
            dot_gaze = np.sum(a_vec * rel_vec, axis=1)
            mag_a = np.linalg.norm(a_vec, axis=1)
            feats["gaze_cosine"] = pd.Series(
                np.clip(dot_gaze / (mag_a * dist + 1e-6), -1.0, 1.0),
                index=ctx.idx, dtype="float32"
            )
        else:
            feats["gaze_cosine"] = zero()

        # Unit vector pointing from the agent to the target (u).
        dist_safe = dist.copy()
        dist_safe[dist_safe == 0] = 1e-6  # avoid division by zero
        u_vec = rel_vec / dist_safe[:, None]

        a_vel, t_vel = ctx.vel, target_ctx.vel

        # A. Approach speed (velocity along the connecting axis).
        #    Positive: moving towards the other animal; negative: moving away.
        a_along = np.sum(a_vel * u_vec, axis=1)
        t_along = np.sum(t_vel * (-u_vec), axis=1)  # target sees the opposite direction
        rel_along = np.sum((a_vel - t_vel) * u_vec, axis=1)

        # B. Lateral speed (perpendicular to the connecting axis):
        #    v_lat = v - (v . u) u
        a_proj = a_along[:, None] * u_vec
        a_lat_vec = a_vel - a_proj
        a_lat_speed = np.linalg.norm(a_lat_vec, axis=1)

        feats["approach_speed_agent"] = pd.Series(a_along, index=ctx.idx, dtype="float32")
        feats["approach_speed_target"] = pd.Series(t_along, index=ctx.idx, dtype="float32")
        feats["approach_speed_rel"] = pd.Series(rel_along, index=ctx.idx, dtype="float32")
        feats["lateral_speed_agent"] = pd.Series(a_lat_speed, index=ctx.idx, dtype="float32")

        return feats


    def _feat_ejaculate_temporal(
        self,
        ctx: AgentContext,
        target_ctx: AgentContext = None,
        **kwargs
    ) -> Dict[str, pd.Series]:
        """
        Pair features aimed at the 'ejaculate' behaviour.

        Heuristic encoded below:
          - the two animals are in close contact, with the agent near the
            target's tail base (used as a genital proxy);
          - the agent was moving fast while in contact during the preceding
            window;
          - at the current frame the agent is almost motionless.

        Args:
            ctx: Pre-processed context of the agent.
            target_ctx: Pre-processed context of the target; ``None`` yields
                an empty dict.
            **kwargs: Ignored.

        Returns:
            Dict of float32 Series indexed by ``ctx.idx`` with NaN/Inf
            replaced by 0.
        """
        if target_ctx is None:
            return {}

        frame_index = ctx.idx

        def zeros() -> pd.Series:
            return pd.Series(0.0, index=frame_index, dtype="float32")

        def euclid(p1: Optional[np.ndarray], p2: Optional[np.ndarray]) -> pd.Series:
            # NOTE(review): a missing part gives distance 0, which the
            # proximity scores below turn into their maximum value.
            if p1 is None or p2 is None:
                return zeros()
            gap = np.linalg.norm(p1 - p2, axis=1).astype("float32")
            return pd.Series(gap, index=frame_index, dtype="float32")

        def as_f32(values) -> pd.Series:
            return pd.Series(values, index=frame_index, dtype="float32")

        # ---- 1. Body parts (target tail_base stands in for the genital area) ----
        agent_parts = self._extract_parts_dict(
            ctx,
            ["nose", "body_center", "tail_base", "hip_left", "hip_right"]
        )
        target_parts = self._extract_parts_dict(
            target_ctx,
            ["body_center", "tail_base"]
        )

        agent_nose = agent_parts.get("nose")
        agent_tail = agent_parts.get("tail_base")
        target_tail = target_parts.get("tail_base")

        # body_center falls back to tail_base when it is unavailable
        agent_body = agent_parts.get("body_center")
        if agent_body is None:
            agent_body = agent_tail
        target_body = target_parts.get("body_center")
        if target_body is None:
            target_body = target_tail

        body_gap = euclid(agent_body, target_body)      # body to body
        genital_gap = euclid(agent_body, target_tail)   # agent body to target tail
        nose_gap = euclid(agent_nose, target_tail)      # agent nose to target tail

        feats: Dict[str, pd.Series] = {
            "ejac_dist_body": body_gap,
            "ejac_dist_gen_body": genital_gap,
            "ejac_dist_gen_nose": nose_gap,
        }

        # ---- 2. Proximity scores (small distance -> large score) ----
        prox_body_s = as_f32(np.exp(-body_gap.to_numpy() / 5.0).astype("float32"))
        prox_gen_s = as_f32(1.0 / (1.0 + genital_gap.to_numpy()))
        prox_nose_s = as_f32(1.0 / (1.0 + nose_gap.to_numpy()))

        feats["ejac_prox_body"] = prox_body_s
        feats["ejac_prox_gen"] = prox_gen_s
        feats["ejac_prox_nose_gen"] = prox_nose_s

        # ---- 3. Build-up memory: peak agent speed while in contact ----
        agent_speed = ctx.speed_series
        in_contact = (body_gap < 5.0).astype("float32")
        contact_speed = (agent_speed * in_contact).astype("float32")

        memory_window = max(self._scale(90), 1)  # 90 frames before rescaling
        rolling_peak = contact_speed.rolling(memory_window, min_periods=1).max()
        activity_memory = rolling_peak.fillna(0.0).astype("float32")
        feats["ejac_activity_memory_3s"] = activity_memory

        # ---- 4. Current frame: nearly still, distance stable ----
        still_flag = (agent_speed < 1.5).astype("float32")
        feats["ejac_is_still"] = still_flag

        body_gap_change = body_gap.diff().abs().fillna(0.0)
        feats["ejac_dist_body_diff"] = body_gap_change.astype("float32")

        # ---- 5. Combined score: still * close * previously active ----
        combined_proximity = prox_body_s * prox_gen_s * prox_nose_s
        static_score = still_flag * combined_proximity * activity_memory
        feats["ejac_static_score"] = static_score.astype("float32")

        # ---- 6. Replace NaN / Inf with 0 ----
        return {
            name: series.replace([np.inf, -np.inf], np.nan)
                        .fillna(0.0)
                        .astype("float32")
            for name, series in feats.items()
        }


    def _feat_follow_pattern(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Rolling features describing a FOLLOW pattern:
          - agent close to the target,
          - both oriented / moving in the same direction,
          - moderate speeds,
          - fairly stable distance over the window.

        For each base window of 15, 30 and 60 frames (rescaled through
        ``self._scale``) the rolling mean/std of the centre distance, the mean
        body and velocity cosines and the mean speeds are emitted.

        Args:
            ctx: Pre-processed context of the agent.
            target_ctx: Pre-processed context of the target; ``None`` yields
                an empty dict.
            **kwargs: Ignored.

        Returns:
            Dict of float32 Series indexed by ``ctx.idx`` with NaN/Inf
            replaced by 0.
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None:
            return feats

        idx = ctx.idx

        def zero():
            return pd.Series(0.0, index=idx, dtype="float32")

        # --- 1. Base quantities ---
        # Agent -> target vector and its length.
        rel_vec = target_ctx.pos - ctx.pos
        rel_dist = np.linalg.norm(rel_vec, axis=1)
        rel_dist_s = pd.Series(rel_dist, index=idx, dtype="float32")

        a_vel = ctx.vel
        t_vel = target_ctx.vel
        a_speed_np = np.linalg.norm(a_vel, axis=1)
        t_speed_np = np.linalg.norm(t_vel, axis=1)  # computed once, reused below

        a_speed = ctx.speed_series.astype("float32")
        t_speed = pd.Series(t_speed_np, index=idx, dtype="float32")

        # Body vector = nose - tail_base; only these two parts are needed, so
        # only they are extracted.
        body_part_names = ["nose", "tail_base"]
        parts_a = self._extract_parts_dict(ctx, body_part_names)
        parts_t = self._extract_parts_dict(target_ctx, body_part_names)

        def body_vec(parts_dict):
            head = parts_dict.get("nose")
            tail = parts_dict.get("tail_base")
            if head is None or tail is None:
                return None
            return head - tail

        a_body = body_vec(parts_a)
        t_body = body_vec(parts_t)

        if a_body is not None and t_body is not None:
            dot_bt = np.sum(a_body * t_body, axis=1)
            mag_bt = np.linalg.norm(a_body, axis=1) * np.linalg.norm(t_body, axis=1)
            cos_body = np.clip(dot_bt / (mag_bt + 1e-6), -1.0, 1.0)
            cos_body_s = pd.Series(cos_body, index=idx, dtype="float32")
        else:
            cos_body_s = zero()

        # Cosine between the two velocity directions; left at 0 on frames
        # where either animal is (almost) not moving.
        moving_mask = (a_speed_np > 1e-3) & (t_speed_np > 1e-3)
        dot_v = np.sum(a_vel * t_vel, axis=1)
        mag_v = a_speed_np * t_speed_np + 1e-6
        cos_vel = np.zeros_like(dot_v, dtype="float32")
        cos_vel[moving_mask] = np.clip(dot_v[moving_mask] / mag_v[moving_mask], -1.0, 1.0)
        cos_vel_s = pd.Series(cos_vel, index=idx, dtype="float32")

        # --- 2. Rolling windows ---
        for w30 in [15, 30, 60]:
            ws = self._scale(w30)
            min_p = max(ws // 3, 1)

            # Mean distance and its variability.
            m_dist = rel_dist_s.rolling(ws, min_periods=min_p).mean()
            s_dist = rel_dist_s.rolling(ws, min_periods=min_p).std()

            # Alignment (body + velocity).
            m_cos_body = cos_body_s.rolling(ws, min_periods=min_p).mean()
            m_cos_vel = cos_vel_s.rolling(ws, min_periods=min_p).mean()

            # Mean speeds.
            m_sp_a = a_speed.rolling(ws, min_periods=min_p).mean()
            m_sp_t = t_speed.rolling(ws, min_periods=min_p).mean()

            feats[f"follow_dist_mean_{w30}"] = m_dist
            feats[f"follow_dist_std_{w30}"] = s_dist
            feats[f"follow_cos_body_mean_{w30}"] = m_cos_body
            feats[f"follow_cos_vel_mean_{w30}"] = m_cos_vel
            feats[f"follow_speed_agent_mean_{w30}"] = m_sp_a
            feats[f"follow_speed_target_mean_{w30}"] = m_sp_t

        # --- 3. Clean NaN / Inf ---
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )

        return feats
    
    def _feat_shortburst_social(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Short-window pair features for burst-like social behaviours.

        For each base window of 10, 20 and 30 frames (rescaled through
        ``self._scale``) three groups are produced:
          - ``sb_att_*``: approach speed means and the change in distance
            relative to ``ws`` frames earlier;
          - ``sb_chase_*``: mean agent/target speeds and mean distance;
          - ``sb_esc_*``: mean heading cosine and the distance gain measured
            against the value ``ws`` frames LATER (uses future frames).

        Args:
            ctx: Pre-processed context of the agent.
            target_ctx: Pre-processed context of the target; ``None`` yields
                an empty dict.
            **kwargs: Ignored.

        Returns:
            Dict of float32 Series indexed by ``ctx.idx`` with NaN/Inf
            replaced by 0.
        """
        if target_ctx is None:
            return {}

        frame_index = ctx.idx

        def to_series(values) -> pd.Series:
            return pd.Series(values, index=frame_index, dtype="float32")

        # Agent -> target vector, its length and the matching unit vector.
        offset = target_ctx.pos - ctx.pos
        gap = np.linalg.norm(offset, axis=1)
        gap_s = to_series(gap)
        gap_nonzero = np.where(gap == 0, 1e-6, gap)
        unit = offset / gap_nonzero[:, None]

        # Velocity components along the connecting axis.
        agent_vel = ctx.vel
        target_vel = target_ctx.vel
        agent_along_s = to_series(np.sum(agent_vel * unit, axis=1))                 # +: agent moves towards target
        closing_s = to_series(np.sum((agent_vel - target_vel) * unit, axis=1))      # +: animals get closer

        agent_speed = ctx.speed_series
        target_speed = to_series(np.linalg.norm(target_ctx.vel, axis=1))

        # Heading cosine: agent body vector (nose - tail_base) vs agent->target.
        agent_parts = self._extract_parts_dict(ctx, ["nose", "tail_base"])
        nose = agent_parts.get("nose")
        tail = agent_parts.get("tail_base")

        if nose is None or tail is None:
            heading_s = pd.Series(0.0, index=frame_index, dtype="float32")
        else:
            body_axis = nose - tail
            numer = np.sum(body_axis * offset, axis=1)
            denom = np.linalg.norm(body_axis, axis=1) * gap_nonzero
            heading_s = to_series(np.clip(numer / (denom + 1e-6), -1.0, 1.0))

        feats: Dict[str, pd.Series] = {}
        for w30 in (10, 20, 30):
            ws = self._scale(w30)
            min_p = max(1, ws // 3)

            def windowed_mean(series: pd.Series) -> pd.Series:
                return series.rolling(ws, min_periods=min_p).mean()

            # Attack-like: strong approach, distance dropping.
            feats[f"sb_att_approach_mean_{w30}"] = windowed_mean(agent_along_s)
            feats[f"sb_att_rel_along_mean_{w30}"] = windowed_mean(closing_s)
            feats[f"sb_att_dist_delta_{w30}"] = (gap_s - gap_s.shift(ws)).fillna(0.0)

            # Chase-like: both fast, distance comparatively small.
            feats[f"sb_chase_speed_agent_mean_{w30}"] = windowed_mean(agent_speed)
            feats[f"sb_chase_speed_target_mean_{w30}"] = windowed_mean(target_speed)
            feats[f"sb_chase_dist_mean_{w30}"] = windowed_mean(gap_s)

            # Escape-like: heading away, distance growing.
            feats[f"sb_esc_heading_cos_mean_{w30}"] = windowed_mean(heading_s)
            feats[f"sb_esc_dist_gain_{w30}"] = (gap_s.shift(-ws) - gap_s).fillna(0.0)

        # Replace NaN / Inf with 0 and force float32.
        return {
            name: series.replace([np.inf, -np.inf], np.nan).fillna(0.0).astype("float32")
            for name, series in feats.items()
        }

    
    def build_pose_tensor(self, tracking: pd.DataFrame):
        """
        Convert long-format tracking rows into a per-mouse position tensor.

        Args:
            tracking: DataFrame with columns ``video_frame``, ``mouse_id``,
                ``bodypart``, ``x`` and ``y``.

        Returns:
            Tuple ``(frames, mouse_ids, pos, per_mouse_df)``:
              - ``frames``: sorted unique frame numbers;
              - ``mouse_ids``: mouse ids in column order;
              - ``pos``: float32 array ``[len(frames), len(mouse_ids), 2]``
                holding ``body_center`` x/y, or the mean over all body parts
                when a mouse has no ``body_center``;
              - ``per_mouse_df``: mouse id -> wide float32 frame with
                (bodypart, coordinate) columns.
        """
        ordered = tracking.sort_values("video_frame")
        frames = np.sort(ordered["video_frame"].unique())

        # Wide layout with column levels (mouse_id, bodypart, coordinate).
        wide = (
            ordered.pivot(
                index="video_frame",
                columns=["mouse_id", "bodypart"],
                values=["x", "y"],
            )
            .reorder_levels([1, 2, 0], axis=1)
            .sort_index(axis=1)
            .astype("float32")
        )

        mouse_ids = list(wide.columns.get_level_values(0).unique())
        pos = np.full((len(frames), len(mouse_ids), 2), np.nan, dtype=np.float32)
        per_mouse_df = {}

        for slot, mid in enumerate(mouse_ids):
            mouse_df = wide[mid]
            per_mouse_df[mid] = mouse_df
            has_center = "body_center" in mouse_df.columns.get_level_values(0)

            for axis_i, coord in enumerate(("x", "y")):
                if has_center:
                    track = mouse_df["body_center"][coord]
                else:
                    # No body_center: average this coordinate over all parts.
                    track = mouse_df.xs(coord, level=1, axis=1).mean(axis=1)
                pos[:, slot, axis_i] = track.reindex(frames).values

        return frames, mouse_ids, pos, per_mouse_df

    def extract_agent_target(
        self, 
        frames: np.ndarray, 
        mouse_ids: List[Any], 
        pos: np.ndarray, 
        agent_id: Any, 
        target_id: Any, 
        per_mouse_df: Dict = None
    ) -> pd.DataFrame:
        """
        Run every registered feature function for one (agent, target) pair.

        Args:
            frames: Frame numbers passed on to the context builder.
            mouse_ids: Mouse ids, ordered like axis 1 of ``pos``.
            pos: Position array indexed as ``pos[:, mouse_slot, :]``.
            agent_id: Id of the agent; an unknown id yields an empty frame.
            target_id: Id of the target. A target context is only built when
                ``self.cfg.use_pairwise`` is set and the id is in ``mouse_ids``.
            per_mouse_df: Optional mapping mouse id -> raw body-part frame.

        Returns:
            DataFrame indexed by the agent context index, columns sorted by
            name, with NaN/Inf replaced by 0. Empty if the agent is unknown.
        """
        if agent_id not in mouse_ids:
            return pd.DataFrame()

        def make_context(mouse_id: Any):
            slot = mouse_ids.index(mouse_id)
            raw = per_mouse_df.get(mouse_id) if per_mouse_df else None
            return self._build_context(frames, pos[:, slot, :], raw)

        # 1. Agent context
        ctx_agent = make_context(agent_id)

        # 2. Target context (optional)
        wants_target = (
            self.cfg.use_pairwise
            and target_id is not None
            and target_id in mouse_ids
        )
        ctx_target = make_context(target_id) if wants_target else None

        # 3. Collect the output of every registered feature function
        collected = {}
        for feature_fn in self.feature_registry.values():
            collected.update(feature_fn(ctx_agent, target_ctx=ctx_target))

        table = pd.DataFrame(collected, index=ctx_agent.idx)
        table = table.replace([np.inf, -np.inf], np.nan).fillna(0.0)

        ordered_columns = sorted(table.columns)
        return table.reindex(ordered_columns, axis=1)


from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import gc
import itertools
import json
import time
import warnings
import joblib
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedGroupKFold

# === IMPORT MODEL & OPTUNA ===
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
import optuna

# Configuration
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings('ignore', category=UserWarning)
np.seterr(invalid="ignore", divide="ignore")

# Metric
import sys
sys.path.append("/kaggle/usr/lib/mabe-f-beta")
# Import the competition metric from the directory appended to sys.path above.
try:
    from metric import score
except ImportError:
    # Fallback when the metric module is unavailable: a stand-in that accepts
    # any arguments and always returns 0.0, so reported scores are
    # meaningless in that case.
    def score(*args, **kwargs): return 0.0

# =========================================================
# 1. CONFIGURATION & SEED
# =========================================================
SEED = 42
def seed_everything(seed=42):
    """Seed the random number generators used for reproducibility.

    Seeds both Python's built-in ``random`` module and NumPy's global
    generator. Libraries with their own seed parameters still need the seed
    passed to them explicitly.

    Args:
        seed: Integer seed applied to every generator.
    """
    import random  # local import keeps this block self-contained

    random.seed(seed)
    np.random.seed(seed)
seed_everything(SEED)

# Root of the competition dataset and its tracking / annotation sub-folders.
INPUT_DIR = Path("/kaggle/input/MABe-mouse-behavior-detection")
TRAIN_TRACKING_DIR = INPUT_DIR / "train_tracking"
TRAIN_ANNOTATION_DIR = INPUT_DIR / "train_annotation"
TEST_TRACKING_DIR = INPUT_DIR / "test_tracking"

# Scratch directory and the directory holding per-lab / per-behavior results.
WORKING_DIR = Path("/kaggle/working")
RESULTS_DIR = Path(r"/kaggle/input/results-ensemble-optuna3")
# Created at import time; no error if it already exists.
# NOTE(review): this path sits under /kaggle/input, which is presumably a
# read-only mount -- confirm it is writable wherever training runs.
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

# Action labels, presumably split into single-animal ("self") and
# two-animal ("pair") behaviors -- verify against the annotation files.
SELF_BEHAVIORS = ["biteobject", "climb", "dig", "exploreobject", "freeze", "genitalgroom", "huddle", "rear", "rest", "run", "selfgroom"]
PAIR_BEHAVIORS = ["allogroom", "approach", "attack", "attemptmount", "avoid", "chase", "chaseattack", "defend", "disengage", "dominance", "dominancegroom", "dominancemount", "ejaculate", "escape", "flinch", "follow", "intromit", "mount", "reciprocalsniff", "shepherd", "sniff", "sniffbody", "sniffface", "sniffgenital", "submit", "tussle"]
# Video ids to exclude; currently empty.
BAD_VIDEOS = []

# =========================================================
# 2. DATA LOADING & PREPARATION (NO CACHE)
# =========================================================

def load_metadata() -> pd.DataFrame:
    """Read ``train.csv`` from ``INPUT_DIR`` and return it as a DataFrame."""
    metadata_path = INPUT_DIR / "train.csv"
    metadata = pd.read_csv(metadata_path)
    return metadata

def get_video_params(video_id: Any, meta: pd.DataFrame) -> Tuple[float, float]:
    """Look up the frame rate and pixel scale of a video.

    Args:
        video_id: Value matched against ``meta["video_id"]``.
        meta: Metadata table with ``video_id``, ``frames_per_second`` and
            ``pix_per_cm_approx`` columns.

    Returns:
        ``(fps, pix_per_cm)`` taken from the first matching row. Each value
        falls back to its default (30.0 fps, 1.0 pix/cm) when the video is
        unknown or the stored value is NaN, infinite or not positive, so
        callers never receive an unusable scale.
    """
    default_fps, default_pix = 30.0, 1.0

    rows = meta.loc[meta["video_id"] == video_id]
    if rows.empty:
        return default_fps, default_pix

    first = rows.iloc[0]
    fps = float(first["frames_per_second"])
    pix = float(first["pix_per_cm_approx"])

    # Same guard the test-time path applies to pix_per_cm.
    if not (np.isfinite(fps) and fps > 0):
        fps = default_fps
    if not (np.isfinite(pix) and pix > 0):
        pix = default_pix
    return fps, pix

def load_tracking(lab_id: str, video_id: Any, is_test=False) -> pd.DataFrame:
    """Load the tracking parquet file of one video.

    Args:
        lab_id: Lab sub-folder name.
        video_id: Video id used as the file stem.
        is_test: Read from the test tracking folder instead of the train one.

    Returns:
        The parquet content as a DataFrame.

    Raises:
        FileNotFoundError: If the parquet file does not exist.
    """
    base_dir = TRAIN_TRACKING_DIR
    if is_test:
        base_dir = TEST_TRACKING_DIR

    parquet_path = base_dir / str(lab_id) / f"{video_id}.parquet"
    if parquet_path.exists():
        return pd.read_parquet(parquet_path)
    raise FileNotFoundError(parquet_path)

def load_annotation(lab_id: str, video_id: Any) -> pd.DataFrame:
    """Load the annotation intervals of one training video.

    Args:
        lab_id: Lab sub-folder name.
        video_id: Video id used as the file stem.

    Returns:
        DataFrame with columns ``agent_id``, ``target_id``, ``action``,
        ``start_frame`` and ``stop_frame``; empty (same columns) when the
        annotation file does not exist.
    """
    wanted = ["agent_id", "target_id", "action", "start_frame", "stop_frame"]
    parquet_path = TRAIN_ANNOTATION_DIR / str(lab_id) / f"{video_id}.parquet"
    if parquet_path.exists():
        return pd.read_parquet(parquet_path)[wanted]
    return pd.DataFrame(columns=wanted)

# Compute features WITHOUT caching to avoid running out of RAM
def get_frame_features_no_cache(lab_id, video_id, agent_id, target_id, meta, is_test=False):
    """Compute per-frame features for one (agent, target) pair, without caching.

    Args:
        lab_id: Lab sub-folder name of the video.
        video_id: Video id.
        agent_id: Mouse id of the agent.
        target_id: Mouse id of the target.
        meta: Metadata table providing ``frames_per_second`` and
            ``pix_per_cm_approx`` per ``video_id``.
        is_test: Read tracking from the test folder and require the video to
            be present in ``meta`` (an ``IndexError`` is raised otherwise).

    Returns:
        ``(frames, features_df)`` where ``features_df`` is indexed by
        ``frames``.
    """
    if is_test:
        row = meta[meta["video_id"] == video_id].iloc[0]
        fps, pix = float(row["frames_per_second"]), float(row["pix_per_cm_approx"])
    else:
        fps, pix = get_video_params(video_id, meta)

    # Guard BOTH paths: previously only the test path replaced a NaN /
    # non-positive pixel scale, so a bad training row went through unchecked.
    # The defaults mirror get_video_params' fallbacks.
    if not (np.isfinite(pix) and pix > 0):
        pix = 1.0
    if not (np.isfinite(fps) and fps > 0):
        fps = 30.0

    tracking = load_tracking(lab_id, video_id, is_test)

    # FeatureExtractor is the class defined earlier in this file.
    fe = FeatureExtractor(fps=fps, pix_per_cm=pix, smooth_sigma=1.0, use_pairwise=True)

    frames, mouse_ids, pos, per_mouse_df = fe.build_pose_tensor(tracking)

    features_df = fe.extract_agent_target(
        frames=frames, mouse_ids=mouse_ids, pos=pos,
        agent_id=agent_id, target_id=target_id, per_mouse_df=per_mouse_df
    )
    # Raises ValueError on a length mismatch, e.g. when the agent is absent
    # from the tracking data and the extractor returned an empty frame.
    features_df.index = frames
    return frames, features_df

def build_frame_dataset_for_lab_behavior(lab_id, behavior, train_meta, mode="self"):
    """Assemble the frame-level dataset of one behavior for one lab.

    For every video of the lab and every annotated (agent, target) pair of
    ``behavior``, features are computed for all frames and each frame is
    labelled 1 when it falls inside an annotated ``[start_frame, stop_frame)``
    interval, else 0. Pairs without any positive frame are skipped.

    Args:
        lab_id: Lab whose videos are used.
        behavior: Action label to build the dataset for.
        train_meta: Training metadata with ``lab_id`` and ``video_id`` columns.
        mode: ``"self"`` uses the agent as its own target for feature
            extraction; any other value uses the annotated target.

    Returns:
        ``(index_df, features_df, labels)``: row identifiers (``video_id``,
        ``agent_id``, ``target_id``, ``video_frame``), the feature table and
        an int8 label array. All three are empty when nothing was collected.
    """
    lab_rows = train_meta[train_meta["lab_id"] == lab_id]
    video_ids = lab_rows["video_id"].unique().tolist()

    index_parts, feature_parts, label_parts = [], [], []

    for video_id in video_ids:
        annotations = load_annotation(lab_id, video_id)
        if annotations.empty:
            continue

        behavior_rows = annotations[annotations["action"] == behavior]
        if behavior_rows.empty:
            continue

        unique_pairs = behavior_rows[["agent_id", "target_id"]].drop_duplicates().values.tolist()
        for agent_id, target_id in unique_pairs:
            effective_target = target_id if mode != "self" else agent_id

            # Features are computed on the fly (no cache).
            frames, feat_df = get_frame_features_no_cache(
                lab_id, video_id, agent_id, effective_target, train_meta
            )

            same_agent = behavior_rows["agent_id"] == agent_id
            pair_rows = behavior_rows[same_agent & (behavior_rows["target_id"] == target_id)]
            if mode == "self" and pair_rows.empty:
                # Self mode: fall back to every interval of this agent.
                pair_rows = behavior_rows[same_agent]

            positive_frames = set()
            for start, stop in zip(pair_rows["start_frame"], pair_rows["stop_frame"]):
                positive_frames.update(range(int(start), int(stop)))
            if not positive_frames:
                continue

            labels = np.isin(frames, list(positive_frames)).astype("int8")
            if labels.sum() == 0:
                continue

            index_parts.append(
                pd.DataFrame({
                    "video_id": video_id,
                    "agent_id": agent_id,
                    "target_id": target_id,
                    "video_frame": frames,
                })
            )
            feature_parts.append(feat_df.reset_index(drop=True))
            label_parts.append(labels)

    if not index_parts:
        return pd.DataFrame(), pd.DataFrame(), np.zeros(0, dtype="int8")

    index_df = pd.concat(index_parts, ignore_index=True)
    features_df = pd.concat(feature_parts, ignore_index=True)
    all_labels = np.concatenate(label_parts).astype("int8")
    return index_df, features_df, all_labels

# =========================================================
# 3. TRAINING & ENSEMBLE HELPERS
# =========================================================

def train_catboost_fold(X_tr, y_tr, X_va, y_va, sw=1.0):
    """Fit a CatBoost binary classifier on one fold with early stopping.

    Args:
        X_tr, y_tr: Training features and labels.
        X_va, y_va: Validation features and labels used as the eval set.
        sw: Value passed as ``scale_pos_weight``.

    Returns:
        The fitted ``CatBoostClassifier`` (best iteration kept).
    """
    params = {
        'iterations': 1000,
        'learning_rate': 0.05,
        'depth': 6,
        'scale_pos_weight': sw,
        'task_type': 'GPU',
        'devices': '0',
        'verbose': 0,
        'allow_writing_files': False,
        'l2_leaf_reg': 5,
        'bootstrap_type': 'Bernoulli',
        'subsample': 0.8,
        'random_seed': SEED,
    }
    model = cb.CatBoostClassifier(**params)

    train_pool = cb.Pool(X_tr, y_tr)
    valid_pool = cb.Pool(X_va, y_va)
    model.fit(
        train_pool,
        eval_set=valid_pool,
        early_stopping_rounds=20,
        use_best_model=True,
    )
    return model

def train_lightgbm_fold(X_tr, y_tr, X_va, y_va, sw=1.0):
    """Train a LightGBM binary booster on one fold with early stopping.

    Args:
        X_tr, y_tr: Training features and labels.
        X_va, y_va: Validation features and labels used for early stopping.
        sw: Value passed as ``scale_pos_weight``.

    Returns:
        The trained booster returned by ``lgb.train``.
    """
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'learning_rate': 0.05,
        'max_depth': 6,
        'num_leaves': 31,
        'scale_pos_weight': sw,
        'device': 'gpu',
        'verbosity': -1,
        'min_child_weight': 5,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'subsample_freq': 1,
        'seed': SEED,
    }
    train_set = lgb.Dataset(X_tr, y_tr)
    valid_set = lgb.Dataset(X_va, y_va)
    stopper = lgb.early_stopping(20, verbose=False)

    booster = lgb.train(
        params,
        train_set,
        1000,
        valid_sets=[valid_set],
        callbacks=[stopper],
    )
    return booster

def optimize_ensemble_weights(oof_dict, y_true):
    """Search blend weights and a decision threshold that maximise F1.

    Runs a 50-trial Optuna study (TPE sampler seeded with ``SEED``). Each
    trial samples one raw weight in [0, 1] per model plus a threshold in
    [0.1, 0.9], blends the out-of-fold predictions with the normalised
    weights and scores the thresholded blend with ``f1_score``.

    Args:
        oof_dict: Mapping model name -> out-of-fold prediction array.
        y_true: Ground-truth binary labels aligned with the predictions.

    Returns:
        ``(weights, threshold)`` where ``weights`` maps each model name to
        its normalised weight from the best trial.
    """
    model_names = list(oof_dict.keys())

    def objective(trial):
        raw = [trial.suggest_float(name, 0.0, 1.0) for name in model_names]
        total = sum(raw) + 1e-6  # epsilon guards against an all-zero draw
        blended = np.zeros_like(y_true, dtype=float)
        for name, weight in zip(model_names, raw):
            blended += oof_dict[name] * (weight / total)
        threshold = trial.suggest_float("th", 0.1, 0.9)
        decisions = (blended >= threshold).astype(int)
        return f1_score(y_true, decisions, zero_division=0)

    sampler = optuna.samplers.TPESampler(seed=SEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=50)

    best = study.best_params
    best_threshold = best.pop("th")
    best_raw = [best[name] for name in model_names]
    norm = sum(best_raw) + 1e-6
    weights = {name: value / norm for name, value in zip(model_names, best_raw)}
    return weights, best_threshold

def train_validate_ensemble(lab_id, behavior, indices, features, labels):
    """Train an XGBoost/CatBoost/LightGBM ensemble for one behavior with grouped CV.

    For each of 3 ``StratifiedGroupKFold`` folds (grouped by
    ``indices["video_id"]``) the three models are trained, saved under
    ``RESULTS_DIR / lab_id / behavior / fold_<k>`` and used to fill
    out-of-fold predictions. Blend weights and a threshold are then
    optimised on the out-of-fold predictions and written to
    ``ensemble_params.json``; the blended OOF table goes to ``oof.parquet``
    and the final F1 to ``f1.txt``.

    Args:
        lab_id: lab identifier, used as a directory name.
        behavior: behavior name, used as a directory name.
        indices: DataFrame with at least a ``video_id`` column, row-aligned
            with ``features`` and ``labels``.
        features: DataFrame of features (column names become the XGBoost
            feature names).
        labels: array of binary labels.

    Returns:
        The out-of-fold F1 as a float, or ``0.0`` when there are no rows or
        no positive labels (the result directory is still created).
    """
    res_dir = RESULTS_DIR / lab_id / behavior
    res_dir.mkdir(parents=True, exist_ok=True)

    # Nothing to learn without at least one positive example.
    if len(labels) == 0 or labels.sum() == 0: return 0.0

    X = features.values.astype("float32")
    y = labels.astype("int8")
    groups = indices["video_id"].values
    
    # One out-of-fold prediction vector per model; `folds` records which
    # fold each row was validated in (-1 = never assigned).
    oof_preds = {m: np.zeros(len(y), dtype="float32") for m in ["xgb", "cat", "lgb"]}
    folds = np.ones(len(y), dtype="int8") * -1

    cv = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=SEED)
    for fold, (tr_idx, va_idx) in enumerate(cv.split(X, y, groups=groups)):
        print(f"   Fold {fold}...", end=" ")
        fd_dir = res_dir / f"fold_{fold}"; fd_dir.mkdir(parents=True, exist_ok=True)
        X_tr, y_tr = X[tr_idx], y[tr_idx]; X_va, y_va = X[va_idx], y[va_idx]
        # Class-imbalance weight: negatives / positives in the training split.
        pos = y_tr.sum(); neg = len(y_tr) - pos
        sw = float(neg/pos) if pos > 0 else 1.0

        # 1. XGBoost
        dtr = xgb.QuantileDMatrix(X_tr, label=y_tr, feature_names=features.columns.tolist(), max_bin=64)
        dva = xgb.DMatrix(X_va, label=y_va, feature_names=features.columns.tolist())
        xp = {
            "objective":"binary:logistic", "eval_metric":"logloss", "device":"cuda", 
            "tree_method":"hist", "learning_rate":0.05, "max_depth":6, "scale_pos_weight":sw,
            "min_child_weight":5, "subsample":0.8, "colsample_bytree":0.8, "max_bin":64, "seed": SEED
        }
        
        # === 'evals=' WAS ADDED TO THE CALL BELOW ===
        # The validation set is passed via `evals` alongside the
        # EarlyStopping callback.
        mx = xgb.train(
            params=xp, 
            dtrain=dtr, 
            num_boost_round=1000, 
            evals=[(dva, "valid")],
            callbacks=[xgb.callback.EarlyStopping(rounds=20, save_best=True)], 
            verbose_eval=False
        )
        mx.save_model(fd_dir / "model_xgb.json")
        oof_preds["xgb"][va_idx] = mx.predict(dva)

        # 2. CatBoost
        mc = train_catboost_fold(X_tr, y_tr, X_va, y_va, sw)
        mc.save_model(str(fd_dir / "model_cat.cbm"))
        oof_preds["cat"][va_idx] = mc.predict_proba(X_va)[:,1]

        # 3. LightGBM
        ml = train_lightgbm_fold(X_tr, y_tr, X_va, y_va, sw)
        ml.save_model(fd_dir / "model_lgb.txt")
        oof_preds["lgb"][va_idx] = ml.predict(X_va)
        folds[va_idx] = fold
        
        print("Done.")
        # Drop the per-fold objects before the next fold is built.
        del X_tr, y_tr, X_va, y_va, dtr, dva, mx, mc, ml
        gc.collect()

    print("   Optimizing Weights...", end=" ")
    weights, th = optimize_ensemble_weights(oof_preds, y)
    with open(res_dir / "ensemble_params.json", "w") as f: json.dump({"weights": weights, "threshold": th}, f)
    
    # Blend the out-of-fold predictions with the optimised weights.
    final_pred = sum(oof_preds[m] * weights[m] for m in weights)
    final_lbl = (final_pred >= th).astype("int8")
    
    # Save OOF
    df = indices.copy(); df["fold"] = folds; df["pred"] = final_pred; df["lbl"] = final_lbl
    df.to_parquet(res_dir / "oof.parquet", index=False)
    
    f1 = f1_score(y, final_lbl, zero_division=0)
    print(f"Best F1: {f1:.4f} (Th={th:.2f}, W={weights})")
    (res_dir / "f1.txt").write_text(f"{f1:.6f}")
    return float(f1)

# =========================================================
# 4. INFERENCE
# =========================================================

def load_ensemble_models(lab_id, behavior):
    """Load the per-fold models saved for one (lab, behavior) pair.

    Every ``fold_*`` directory under ``RESULTS_DIR / lab_id / behavior``
    that contains ``model_xgb.json`` contributes one entry. The CatBoost
    and LightGBM models are optional: if either fails to load, its slot is
    set to ``None`` and a notice is printed.

    Args:
        lab_id: lab identifier (directory name).
        behavior: behavior name (directory name).

    Returns:
        A list of dicts ``{"xgb": Booster, "cat": model or None,
        "lgb": Booster or None}``, one per fold, in sorted directory order.
        Empty when the base directory does not exist.
    """
    base = RESULTS_DIR / lab_id / behavior
    if not base.exists():
        return []

    models = []
    for fd in sorted(base.glob("fold_*")):
        xgb_path = fd / "model_xgb.json"
        if not xgb_path.exists():
            continue

        xgb_b = xgb.Booster()
        xgb_b.load_model(str(xgb_path))

        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate while a missing/corrupt model file
        # only disables that model.
        try:
            cat_m = cb.CatBoostClassifier()
            cat_m.load_model(str(fd / "model_cat.cbm"))
        except Exception as exc:
            print(f"   [warn] CatBoost model not loaded from {fd}: {exc}")
            cat_m = None

        try:
            lgb_m = lgb.Booster(model_file=str(fd / "model_lgb.txt"))
        except Exception as exc:
            print(f"   [warn] LightGBM model not loaded from {fd}: {exc}")
            lgb_m = None

        models.append({"xgb": xgb_b, "cat": cat_m, "lgb": lgb_m})
    return models

def predict_behaviors_for_pair(lab_id, video_id, aid, tid, behaviors, test_meta):
    """Predict behavior segments for one (agent, target) pair of a video.

    For each behavior that has saved ensemble parameters and fold models,
    the per-fold blended score is thresholded and averaged over folds. Each
    frame is then labelled with the behavior of highest score (or "none"
    when every score is zero) and consecutive equal labels are merged into
    segments.

    Args:
        lab_id: lab identifier; anything other than "TranquilPanther"
            returns ``None`` immediately.
        video_id: video identifier (cast to ``int`` in the output).
        aid: agent mouse id.
        tid: target mouse id.
        behaviors: iterable of behavior names to score.
        test_meta: metadata table forwarded to the feature builder.

    Returns:
        ``None`` for a non-matching lab; otherwise a DataFrame of segments
        with columns ``video_id``, ``action``, ``start_frame``,
        ``stop_frame``. When there are no features or no scored behavior
        the frame is empty but carries those columns; when behaviors were
        scored but every frame is "none" the result is
        ``pd.DataFrame([])``.
    """
    if lab_id != "TranquilPanther": return None
    frames, feat_df = get_frame_features_no_cache(lab_id, video_id, aid, tid, test_meta, is_test=True)
    if feat_df.empty: return pd.DataFrame(columns=["video_id", "action", "start_frame", "stop_frame"])
    
    scores = {}
    for bhv in behaviors:
        base = RESULTS_DIR / lab_id / bhv
        # Behaviors without saved ensemble parameters are skipped.
        if not (base / "ensemble_params.json").exists(): continue
        with open(base / "ensemble_params.json") as f: p = json.load(f)
        ws, th = p["weights"], p["threshold"]
        
        folds = load_ensemble_models(lab_id, bhv)
        if not folds: continue
        
        # Build the model input in the column order stored in the first
        # fold's XGBoost booster; columns absent from feat_df stay at 0.0.
        cols = folds[0]["xgb"].feature_names
        X = pd.DataFrame(0.0, index=feat_df.index, columns=cols, dtype=np.float32)
        c = list(set(cols) & set(feat_df.columns))
        if c: X[c] = feat_df[c]
        dtest = xgb.DMatrix(X, feature_names=cols)
        
        agg = np.zeros(len(feat_df), dtype=np.float32)
        for m in folds:
            px = m["xgb"].predict(dtest)
            # A model that failed to load contributes zeros.
            # NOTE(review): this relies on the truthiness of the model
            # objects; `is not None` would be more explicit - confirm.
            pc = m["cat"].predict_proba(X)[:,1] if m["cat"] else np.zeros_like(px)
            pl = m["lgb"].predict(X) if m["lgb"] else np.zeros_like(px)
            
            avg = px*ws.get("xgb", 0.33) + pc*ws.get("cat", 0.33) + pl*ws.get("lgb", 0.33)
            # Keep the blended score only where it reaches the threshold.
            agg += avg * (avg >= th).astype("int8")
        
        if folds: scores[bhv] = agg / len(folds)
        
        del X, dtest
        gc.collect()

    if not scores: return pd.DataFrame(columns=["video_id", "action", "start_frame", "stop_frame"])
    
    # One column per behavior; a frame whose best score is 0 becomes "none".
    bl = list(scores.keys()); mat = np.vstack([scores[b] for b in bl]).T
    lbls = np.where(mat.max(1)==0, "none", np.array(bl)[mat.argmax(1)])
    
    # Run-length encode the per-frame labels into segments; stop_frame is
    # the last frame of the run plus one.
    segs = []; prev = "none"; start = None; pf = None
    for f, l in zip(frames, lbls):
        if l != prev:
            if prev != "none": segs.append({"video_id": int(video_id), "action": prev, "start_frame": int(start), "stop_frame": int(pf)+1})
            prev = l; start = f
        pf = f
    # Flush the run that is still open at the end of the video.
    if prev != "none": segs.append({"video_id": int(video_id), "action": prev, "start_frame": int(start), "stop_frame": int(pf)+1})
    
    return pd.DataFrame(segs)

# =========================================================
# 5. MAIN
# =========================================================
# Inference driver: predicts segments for every test video of `target_lab`
# and writes them to a submission CSV.
target_lab = "TranquilPanther"
print("\n=== START INFERENCE ===")
test_meta = pd.read_csv(INPUT_DIR / "test.csv")
# Keep only the test rows that belong to the target lab.
test_meta = test_meta[test_meta["lab_id"] == target_lab].reset_index(drop=True)

# Behaviors that have a result directory, split into self and pair behaviors.
trained = sorted([p.name for p in (RESULTS_DIR/target_lab).iterdir() if p.is_dir()])
sb, pb = [b for b in trained if b in SELF_BEHAVIORS], [b for b in trained if b in PAIR_BEHAVIORS]

all_segs = []
# Format a mouse id as "mouse<N>" unless it already starts with "mouse".
def fid(i): return str(i) if str(i).startswith("mouse") else f"mouse{i}"

for vid in sorted(test_meta["video_id"].unique()):
    print(f"Predicting Video {vid}...")
    tr = load_tracking(target_lab, vid, is_test=True)
    mids = sorted(tr["mouse_id"].unique())
    
    # Self behaviors: each mouse is both agent and target.
    if sb:
        for m in mids:
            df = predict_behaviors_for_pair(target_lab, vid, m, m, sb, test_meta)
            if df is not None and not df.empty:
                df["agent_id"] = fid(m); df["target_id"] = "self"
                all_segs.append(df)
    # Pair behaviors: every ordered (agent, target) pair of distinct mice.
    if pb and len(mids) > 1:
        for a, t in itertools.permutations(mids, 2):
            df = predict_behaviors_for_pair(target_lab, vid, a, t, pb, test_meta)
            if df is not None and not df.empty:
                df["agent_id"] = fid(a); df["target_id"] = fid(t)
                all_segs.append(df)
    del tr
    gc.collect()

cols = ["video_id", "agent_id", "target_id", "action", "start_frame", "stop_frame"]

if all_segs:
    sub4 = pd.concat(all_segs, ignore_index=True)
    sub4 = sub4[cols].sort_values(["video_id", "agent_id", "target_id", "action", "start_frame"]).reset_index(drop=True)    
    sub4.insert(0, "row_id", np.arange(len(sub4), dtype=np.int64))
else:
    # No segments at all: still write a CSV with the expected header.
    sub4 = pd.DataFrame(columns=["row_id"] + cols)

sub4.to_csv(WORKING_DIR / "submission8.csv", index=False)
print(f"\nDone! Saved submission to {WORKING_DIR / 'submission8.csv'}")



=== START INFERENCE ===

Done! Saved submission to /kaggle/working/submission8.csv


# NiftyGoldfinch

In [16]:
from pathlib import Path
import shutil
import gc

WORKING_DIR = Path("/kaggle/working")

# 1) Delete everything in /kaggle/working except .csv files
for path in WORKING_DIR.iterdir():
    # keep .csv files
    if path.is_file() and path.suffix == ".csv":
        continue

    if path.is_file():
        try:
            path.unlink()
        except Exception as e:
            print(f"Cannot remove file {path}: {e}")
    elif path.is_dir():
        try:
            # NOTE(review): with ignore_errors=True removal failures are
            # ignored by rmtree, so the except branch below is presumably
            # rarely (if ever) reached - confirm whether that is intended.
            shutil.rmtree(path, ignore_errors=True)
        except Exception as e:
            print(f"Cannot remove dir {path}: {e}")


# Trigger a garbage-collection pass after the cleanup.
gc.collect()

78

In [17]:
from __future__ import annotations
from typing import Dict, List, Tuple, Any, Optional
import warnings
from dataclasses import dataclass, field
import numpy as np
import pandas as pd
from scipy.ndimage import gaussian_filter1d
from tqdm import tqdm

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
np.seterr(invalid="ignore", divide="ignore")

# =============================================================================
# 1. CONFIGURATION
# =============================================================================
@dataclass
class FeatureConfig:
    """
    Holds the configuration parameters (hyperparameters) of the extractor.
    """
    # Frame rate of the video; used to rescale window sizes and as 1/dt.
    fps: float = 30.0
    # Divisor applied to pixel coordinates to obtain centimetres.
    pix_per_cm: float = 1.0
    # Sigma of the Gaussian smoothing; None disables smoothing.
    smooth_sigma: float = 1.0
    # Flag stored for pairwise features (its use is outside this section).
    use_pairwise: bool = True


# =============================================================================
# 2. AGENT CONTEXT
# =============================================================================
@dataclass
class AgentContext:
    """
    Container holding the preprocessed data of a single mouse.
    Avoids recomputing velocity/acceleration several times.
    """
    idx: pd.Index          # Frame index
    pos: np.ndarray        # [F, 2] cm
    vel: np.ndarray        # [F, 2] cm/s
    speed: np.ndarray      # [F, 1] cm/s
    acc: np.ndarray        # [F, 2] cm/s^2
    
    cx: pd.Series          # X coordinate as a Series (for rolling operations)
    cy: pd.Series          # Y coordinate as a Series
    speed_series: pd.Series # Speed as a Series
    
    raw_df: Optional[pd.DataFrame] = None # Raw per-body-part data


# =============================================================================
# 3. FEATURE EXTRACTOR
# =============================================================================
class FeatureExtractor:
    """
    Class trích xuất đặc trưng hành vi từ dữ liệu tracking.
    """
    def __init__(self, fps: float, pix_per_cm: float, smooth_sigma: float = 1.0, use_pairwise: bool = True):
        """Store the extraction settings and register the feature modules.

        Args:
            fps: frame rate of the video (converted to float).
            pix_per_cm: pixels per centimetre (converted to float).
            smooth_sigma: sigma for Gaussian smoothing; None disables it.
            use_pairwise: flag stored in the config for pairwise features.
        """
        # Copy the constructor arguments into the config object.
        config = FeatureConfig(
            fps=float(fps),
            pix_per_cm=float(pix_per_cm),
            smooth_sigma=smooth_sigma,
            use_pairwise=use_pairwise,
        )
        self.cfg = config

        # Feature functions to run, keyed by name; insertion order is kept.
        registry = {}
        registry["kinematics"] = self._feat_basic_kinematics
        registry["multiscale"] = self._feat_multiscale
        registry["long_range"] = self._feat_long_range
        registry["cumulative"] = self._feat_cumulative
        registry["curvature"] = self._feat_curvature
        registry["speed_asym"] = self._feat_speed_asym
        registry["gauss_shift"] = self._feat_gauss_shift
        registry["pose_shape"] = self._feat_pose_shape
        registry["pairwise"] = self._feat_pairwise
        registry["follow"] = self._feat_follow_pattern
        registry["short"] = self._feat_shortburst_social
        registry["a"] = self._feat_attack_sniff
        registry["b"] = self._feat_climb
        self.feature_registry = registry

    # --- Helpers ---
    def _scale(self, n_frames_30fps: int) -> int:
        """Quy đổi số frame từ chuẩn 30fps sang fps thực tế của video."""
        return max(1, int(round(n_frames_30fps * self.cfg.fps / 30.0)))

    def _to_cm(self, arr):
        """Chuyển pixel -> cm."""
        return arr / self.cfg.pix_per_cm

    def _smooth(self, x):
        """Làm mượt dữ liệu bằng Gaussian filter."""
        if self.cfg.smooth_sigma is None or x.shape[0] < 3: return x
        if np.all(np.isnan(x)): return x
        return gaussian_filter1d(x, sigma=self.cfg.smooth_sigma, axis=0, mode="nearest")

    def _forward_fill_nan(self, pos):
        """
        Điền dữ liệu thiếu (NaN) bằng giá trị hợp lệ trước đó (Forward Fill).
        """
        if np.all(np.isnan(pos)):
            return np.zeros_like(pos)

        pos_ffill = pos.copy()
        mask = np.any(~np.isnan(pos_ffill), axis=1)
        if not mask.any():
            return np.zeros_like(pos_ffill)

        valid_idx = np.where(mask)[0]
        first, last = valid_idx[0], valid_idx[-1]
        pos_ffill[:first] = pos_ffill[first]
        pos_ffill[last + 1:] = pos_ffill[last]
        df_temp = pd.DataFrame(pos_ffill)
        df_temp = df_temp.ffill()
        return df_temp.to_numpy()
    
    def _speed_series(self, cx: pd.Series, cy: pd.Series) -> pd.Series:
        dx = cx.diff()
        dy = cy.diff()
        v = np.hypot(dx, dy).fillna(0.0) * self.cfg.fps
        return v.astype("float32")
    
    def _roll_future_mean(self, s: pd.Series, w: int, min_p: int = 1) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).mean().iloc[::-1]

    def _roll_future_var(self, s: pd.Series, w: int, min_p: int = 2) -> pd.Series:
        return s.iloc[::-1].rolling(w, min_periods=min_p).var().iloc[::-1]

    # --- Core Logic ---
    def _compute_kinematics(self, pos_px: np.ndarray):
        """
        Tính toán vật lý cơ bản: Pos(cm), Vel, Speed, Acc.
        Input: Array [Frames, 2] (pixel).
        Output: Tuple (pos_cm, vel, speed, acc).
        """
        pos_ffill = self._forward_fill_nan(pos_px)
        pos_cm = self._to_cm(pos_ffill.astype(np.float32))
        pos_cm = self._smooth(pos_cm)                                               # [F, 2]

        dt = 1.0 / self.cfg.fps
        vel = np.zeros_like(pos_cm, dtype=np.float32)
        vel[1:] = (pos_cm[1:] - pos_cm[:-1]) / dt                                   # [F, 2: (vx, vy)]
        speed = np.linalg.norm(vel, axis=1, keepdims=True).astype(np.float32)       # [F, 1]

        acc = np.zeros_like(pos_cm, dtype=np.float32)                          
        acc[1:] = (vel[1:] - vel[:-1]) / dt                                         # [F, 2:(ax, ay)]
        return pos_cm.astype(np.float32), vel, speed, acc

    def _build_context(self, frames, pos_px, mouse_df=None) -> AgentContext:
        """
        Build the AgentContext holding the full kinematic information of
        one mouse (arrays plus Series views indexed by frame).
        """
        pos_cm, velocity, speed, accel = self._compute_kinematics(pos_px)
        frame_index = pd.Index(frames, name="frame")

        x_series = pd.Series(pos_cm[:, 0], index=frame_index)
        y_series = pd.Series(pos_cm[:, 1], index=frame_index)
        speed_as_series = pd.Series(speed[:, 0], index=frame_index)

        return AgentContext(
            idx=frame_index,
            pos=pos_cm,
            vel=velocity,
            speed=speed,
            acc=accel,
            cx=x_series,
            cy=y_series,
            speed_series=speed_as_series,
            raw_df=mouse_df,
        )

    # --- Feature Modules ---
    def _feat_basic_kinematics(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Lấy các giá trị thô: tọa độ x, y, vận tốc vx, vy, tốc độ, gia tốc ax, ay.
        """
        return {
            "a_x": ctx.pos[:, 0], "a_y": ctx.pos[:, 1],
            "a_vx": ctx.vel[:, 0], "a_vy": ctx.vel[:, 1],
            "a_speed": ctx.speed[:, 0],
            "a_ax": ctx.acc[:, 0], "a_ay": ctx.acc[:, 1]
        }

    def _feat_multiscale(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tính tốc độ trung bình (Mean) và độ lệch chuẩn (Std) ở đa mức thời gian.
        Feature 'sp_ratio' đo độ bùng nổ (Burstiness).
        """
        feats = {}
        speed = ctx.speed_series
        frame_scales = [10, 40, 160]
        for scale in frame_scales:
            ws = self._scale(scale)
            if len(speed) >= ws:
                roller = speed.rolling(ws, min_periods=max(1, ws//4), center=True)
                feats[f"sp_m{scale}"] = roller.mean().astype("float32")
                feats[f"sp_s{scale}"] = roller.std().astype("float32")
        feats[f"sp_ratio"] = feats["sp_m10"] / (feats["sp_m160"] + 1e-6)
        return feats 
        
    def _feat_long_range(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Đặc trưng ngữ cảnh dài hạn:
        - x_ml, y_ml: Vị trí trung bình trong quá khứ.
        - sp_pct: Xếp hạng (percentile) của tốc độ hiện tại so với quá khứ.
        """
        feats: Dict[str, pd.Series] = {}
        speed = ctx.speed_series

        for window in [120, 240]:
            ws = self._scale(window)
            if len(ctx.cx) >= ws:
                feats[f"x_ml{window}"] = ctx.cx.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()
                feats[f"y_ml{window}"] = ctx.cy.rolling(ws, min_periods=max(5, ws // 6), center=True).mean()

        for span in [60, 120]:
            s = self._scale(span)
            feats[f"x_e{span}"] = ctx.cx.ewm(span=s, min_periods=1).mean()
            feats[f"y_e{span}"] = ctx.cy.ewm(span=s, min_periods=1).mean()

        for window in [60, 120]:
            ws = self._scale(window)
            if len(speed) >= ws:
                feats[f"sp_pct{window}"] = speed.rolling(
                    ws, min_periods=max(5, ws // 6), center=True
                ).rank(pct=True)
        return feats
    

    def _feat_curvature(self, ctx: AgentContext, **kwargs) -> Dict:
        feats = {}

        vel_x, vel_y = ctx.vel[:, 0], ctx.vel[:, 1]
        acc_x, acc_y = ctx.acc[:, 0], ctx.acc[:, 1]
        cross_prod = vel_x * acc_y - vel_y * acc_x
        vel_mag = np.sqrt(vel_x**2 + vel_y**2)
        moving_mask = vel_mag > 2.0
        vel_mag_safe = np.maximum(vel_mag, 0.1 / self.cfg.fps)
        raw_curv = cross_prod / (vel_mag_safe**3)
        raw_curv = np.where(moving_mask, raw_curv, 0.0)
        min_turn_radius_cm = 0.5
        max_k = 1.0 / min_turn_radius_cm
        raw_curv = np.clip(raw_curv, -max_k, max_k)
        abs_curv = np.abs(raw_curv)
        abs_curv_series = pd.Series(abs_curv, index=ctx.idx)

        for w in [30, 60]:
            ws = self._scale(w)
            min_p = max(ws // 3, 1)
            feats[f"curv_mean_{w}"] = abs_curv_series.rolling(ws, min_periods=min_p).mean()

        angle = np.arctan2(vel_y, vel_x)
        angle_series = pd.Series(angle, index=ctx.idx)
        angle_change = np.abs(angle_series.diff().fillna(0.0))
        angle_change = np.where(angle_change > np.pi, 2 * np.pi - angle_change, angle_change)
        angle_change_series = pd.Series(angle_change, index=ctx.idx)
        angle_change_series = pd.Series(np.where(moving_mask, angle_change_series, 0.0), index=ctx.idx)

        ws = self._scale(30)
        feats["turn_rate_30"] = angle_change_series.rolling(ws, min_periods=max(ws // 3, 1)).sum()

        return feats
    
    def _feat_cumulative(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Tổng quãng đường di chuyển trong một khoảng thời gian dài xung quanh frame hiện tại.
        """
        feats = {}
        L = max(1, self._scale(180))
        step = np.hypot(ctx.cx.diff(), ctx.cy.diff()).fillna(0.0)
        path = step.rolling(2 * L + 1, min_periods=max(5, L // 6), center=True).sum()
        feats["path_cum180"] =  path.fillna(0.0).astype("float32")
        return feats

    def _feat_speed_asym(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Bất đối xứng tốc độ (Tương lai - Quá khứ).
        """
        w = max(3, self._scale(30))
        v = ctx.speed_series
        v_past = v.rolling(w, min_periods=1).mean()
        v_fut = self._roll_future_mean(v, w, min_p=1)
        return {"spd_asym_1s": (v_fut - v_past).fillna(0.0)}
    
    def _feat_gauss_shift(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Độ lệch Gaussian (KL Divergence) giữa quá khứ và tương lai.
        Đo lường sự thay đổi trạng thái thống kê.
        """
        w = max(5, self._scale(30))
        v = ctx.speed_series
        mu_p = v.rolling(w, min_periods=1).mean()
        va_p = v.rolling(w, min_periods=1).var().clip(lower=1e-6)
        mu_f = self._roll_future_mean(v, w, min_p=1)
        va_f = self._roll_future_var(v, w, min_p=1).clip(lower=1e-6)

        kl_pf = 0.5 * (
            (va_p / va_f) + ((mu_f - mu_p) ** 2) / va_f - 1.0 + np.log(va_f / va_p)
        )
        kl_fp = 0.5 * (
            (va_f / va_p) + ((mu_p - mu_f) ** 2) / va_p - 1.0 + np.log(va_p / va_f)
        )
        return {
            "spd_symkl_1s": (kl_pf + kl_fp).replace([np.inf, -np.inf], np.nan).fillna(0.0)
        }
    
    def _extract_part(self, ctx: AgentContext, part: str) -> Optional[np.ndarray]:
        if ctx.raw_df is None: return None
        if part not in ctx.raw_df.columns.get_level_values(0): return None
        try:
            sub_df = ctx.raw_df.xs(part, axis=1, level=0)[["x", "y"]].reindex(ctx.idx)
        except KeyError: return None
        raw = sub_df.to_numpy()
        raw = self._forward_fill_nan(raw)
        cm = self._to_cm(raw.astype(np.float32))
        return self._smooth(cm)
    
    def _extract_parts_dict(self, ctx: AgentContext, parts: List[str] = None) -> Dict[str, Optional[np.ndarray]]:
        out = {}
        for p in parts:
            out[p] = self._extract_part(ctx, p)
        return out
        
    def _feat_pose_shape(self, ctx: AgentContext, **kwargs) -> Dict:
        """
        Placeholder cho các đặc trưng hình dáng (Elongation, Body Angle...).
        """
        feats = {}

        def zero(): return pd.Series(0.0, index=ctx.idx, dtype="float32")

        def dist(k1, k2):
            p1, p2 = parts.get(k1), parts.get(k2)
            if p1 is None or p2 is None: return zero()
            d = np.linalg.norm(p1 - p2, axis=1)
            return pd.Series(d, index=ctx.idx, dtype="float32")
        
        def body_angle():
            if parts.get("nose") is None: return zero()
            if parts.get("body_center") is None: return zero()
            if parts.get("tail_base") is None: return zero()

            v1 = parts.get("nose") - parts.get("body_center")
            v2 = parts.get("tail_base") - parts.get("body_center")
            dot_product = np.sum(v1 * v2, axis=1)
            mag = np.linalg.norm(v1, axis=1) * np.linalg.norm(v2, axis=1)
            cos_angle = np.clip(dot_product / (mag + 1e-6), -1.0, 1.0).astype("float32")
            return cos_angle
        
        def elongation():
            if parts.get("nose")          is None: return zero()
            if parts.get("tail_base")     is None: return zero()
            if parts.get("ear_left")  is None: return zero()
            if parts.get("ear_right") is None: return zero()

            d1 = dist("nose", "tail_base")
            d2 = dist("ear_left", "ear_right")
            elongation = d1 / (d2 + 1e-6).astype("float32")
            return elongation
        
        def part_speed(part: str, n_frames_30fps: int) -> Dict:
            part_pos = self._extract_part(ctx, part)
            if part_pos is None: return zero()
            
            s_x = pd.Series(part_pos[:, 0], index=ctx.idx)
            s_y = pd.Series(part_pos[:, 1], index=ctx.idx)
            raw_speed = self._speed_series(s_x, s_y)

            ws = self._scale(n_frames_30fps)
            val = raw_speed.rolling(ws, min_periods=1, center=True).mean()
            return val.astype("float32")


        target_parts = ["nose", "body_center", "ear_left", "ear_right", "tail_base"]
        
        parts = self._extract_parts_dict(ctx, target_parts)

        feats["aa_nose_tailbase_dist"]       = dist("nose", "tail_base")
        feats["aa_earleft_tailbase_dist"]    = dist("ear_left", "tail_base")
        feats["aa_earright_tailbase_dist"]   = dist("ear_right", "tail_base")
        feats["aa_nose_earleft_dist"]        = dist("ear_left", "nose")
        feats["aa_nose_ear_right_dist"]      = dist("ear_right", "nose")
        feats["aa_nose_bodycenter_dist"]     = dist("nose", "body_center")
        
        feats["a_elongation"]                = elongation()
        feats["a_bodyangle"]                 = body_angle()
        feats["a_tail_base_vel_500ms"]       = part_speed("tail_base", 15)
        feats["a_tail_base_vel_1000ms"]      = part_speed("tail_base", 30)
        feats["a_tail_base_vel_2000ms"]      = part_speed("tail_base", 60)
        feats["a_tail_base_vel_3000ms"]      = part_speed("tail_base", 90)

        feats["a_nose_vel_500ms"]            = part_speed("nose", 15)
        feats["a_nose_vel_1000ms"]           = part_speed("nose", 30)
        feats["a_nose_vel_2000ms"]           = part_speed("nose", 60)
        feats["a_nose_vel_3000ms"]           = part_speed("nose", 90)

        feats["a_ear_right_vel_500ms"]       = part_speed("ear_right", 15)
        feats["a_ear_right_vel_1000ms"]      = part_speed("ear_right", 30)
        feats["a_ear_right_vel_2000ms"]      = part_speed("ear_right", 60)
        feats["a_ear_right_vel_3000ms"]      = part_speed("ear_right", 90)
        # feats["a_ear_left_vel_500ms"]        = part_speed("ear_left", 15)
        # feats["a_ear_left_vel_1000ms"]       = part_speed("ear_left", 30)
        # feats["a_ear_left_vel_2000ms"]       = part_speed("ear_left", 60)
        # feats["a_ear_left_vel_3000ms"]       = part_speed("ear_left", 90)
        
        return feats

    def _feat_attack_sniff(
        self,
        ctx: AgentContext,
        target_ctx: AgentContext = None,
        **kwargs
    ) -> Dict[str, pd.Series]:
        """
        Đặc trưng phân biệt attack vs sniff cho lab 2-mouse (agent=1, target=2).
    
        Ý tưởng:
          - attack: speed 2 con biến động mạnh, đổi hướng nhiều, body overlap cao.
          - sniff : mũi gần cổ/thân, overlap thấp hơn, motion nhẹ/ổn định hơn.
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
    
        def zero():
            return pd.Series(0.0, index=idx, dtype="float32")

        # helper khoảng cách
        def dist(p1, p2):
            if p1 is None or p2 is None:
                return zero()
            d = np.linalg.norm(p1 - p2, axis=1)
            return pd.Series(d, index=idx, dtype="float32")

        parts_a = self._extract_parts_dict(ctx, ["nose", "tail_base"])
        parts_t = self._extract_parts_dict(target_ctx, ["nose", "tail_base"])
    
        # ---------------------------------------------------------
        # 2) ĐIỂM ĐẠI DIỆN THÂN (BODY CENTER) CHO MỖI CON
        #    dùng trung bình neck – hips – tail_base
        # ---------------------------------------------------------
    
        # ---------------------------------------------------------
        # 4) MỨC ĐỘ “BẠO LỰC”: DAO ĐỘNG TỐC ĐỘ & ĐỔI HƯỚNG
        # ---------------------------------------------------------
        # speed 2 con từ velocity
        a_speed = pd.Series(
            np.linalg.norm(ctx.vel, axis=1),
            index=idx,
            dtype="float32",
        )
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32",
        )

        ws_05 = self._scale(15)  # ~0.5s
        mp_05 = max(ws_05 // 3, 1)
    
        feats["as_a_speed_std_05"] = (
            a_speed.rolling(ws_05, min_periods=mp_05).std().fillna(0.0).astype("float32")
        )
        feats["as_t_speed_std_05"] = (
            t_speed.rolling(ws_05, min_periods=mp_05).std().fillna(0.0).astype("float32")
        )
        feats["as_speed_std_sum_05"] = (
            feats["as_a_speed_std_05"] + feats["as_t_speed_std_05"]
        )
    
        # Đổi hướng (jerk góc) của agent
        a_angle = np.arctan2(ctx.vel[:, 1], ctx.vel[:, 0])
        a_angle_diff = np.abs(np.diff(a_angle))
        a_angle_diff = np.where(
            a_angle_diff > np.pi, 2 * np.pi - a_angle_diff, a_angle_diff
        )
        a_angle_diff = np.concatenate([[0.0], a_angle_diff])
        a_angle_diff_s = pd.Series(a_angle_diff, index=idx, dtype="float32")
    
        feats["as_a_turn_jerk_05"] = (
            a_angle_diff_s.rolling(ws_05, min_periods=mp_05)
            .sum()
            .fillna(0.0)
            .astype("float32")
        )

        # ---------------------------------------------------------
        # 5) XẤP XỈ OVERLAP CƠ THỂ (BODY OVERLAP)
        #    dùng bbox từ các bộ phận thân
        # ---------------------------------------------------------
        def build_bbox(parts: Dict[str, Optional[np.ndarray]]):
            arrs = []
            for k in ["nose", "body_center", "ear_left", "ear_right", "tail_base"]:
                if parts.get(k) is not None:
                    arrs.append(parts[k])
            if not arrs:
                return None
            stack = np.stack(arrs, axis=1)  # [F, K, 2]
            xs = stack[:, :, 0]
            ys = stack[:, :, 1]
            xmin = np.nanmin(xs, axis=1)
            xmax = np.nanmax(xs, axis=1)
            ymin = np.nanmin(ys, axis=1)
            ymax = np.nanmax(ys, axis=1)
            return np.stack([xmin, ymin, xmax, ymax], axis=1).astype("float32")
    
        def iou_box(box1: np.ndarray, box2: np.ndarray):
            # box: [F, 4] = (xmin, ymin, xmax, ymax)
            x1 = np.maximum(box1[:, 0], box2[:, 0])
            y1 = np.maximum(box1[:, 1], box2[:, 1])
            x2 = np.minimum(box1[:, 2], box2[:, 2])
            y2 = np.minimum(box1[:, 3], box2[:, 3])
    
            inter_w = np.clip(x2 - x1, 0.0, None)
            inter_h = np.clip(y2 - y1, 0.0, None)
            inter = inter_w * inter_h
    
            area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])
            area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
            union = area1 + area2 - inter + 1e-6
            iou = inter / union
            return iou.astype("float32")

        bbox_a = build_bbox(parts_a)
        bbox_t = build_bbox(parts_t)
        if bbox_a is not None and bbox_t is not None:
            iou = iou_box(bbox_a, bbox_t)
            iou_s = pd.Series(iou, index=idx, dtype="float32")
    
            feats["as_body_iou"] = iou_s
    
            ws_1s = self._scale(30)
            mp_1s = max(ws_1s // 3, 1)
            feats["as_body_iou_mean_1s"] = (
                iou_s.rolling(ws_1s, min_periods=mp_1s).mean().fillna(0.0).astype("float32")
            )
        else:
            feats["as_body_iou"] = zero()
            feats["as_body_iou_mean_1s"] = zero()
    
        # ---------------------------------------------------------
        # 6) DỌN NẠN NaN / Inf
        # ---------------------------------------------------------
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )
    
        return feats

    def _feat_climb(self, ctx: AgentContext, **kwargs) -> Dict[str, pd.Series]:
        """
        Feature chuyên cho hành vi climb trong arena hình chữ nhật (33 x 19 cm).
    
        Ý tưởng:
          - Chuột đi gần tường: dist_wall giảm nhanh.
          - Khi climb: sát tường (dist_wall nhỏ), v_normal ~ 0,
            nhưng vẫn có v_tangent (bò ngang trên tường / di chuyển dọc biên).
        """
        feats: Dict[str, pd.Series] = {}
        idx = ctx.idx
    
        def zero() -> pd.Series:
            return pd.Series(0.0, index=idx, dtype="float32")
    
        # --- 1. Arena size (cm) ---
        # Nếu bạn đã set trong FeatureConfig thì dùng:
        # W = self.cfg.arena_width_cm or 33.0
        # H = self.cfg.arena_height_cm or 19.0
        # Ở đây fix luôn cho lab này:
        W = 33.0
        H = 19.0
        parts = self._extract_parts_dict(ctx, ["nose"])
        head = parts.get("nose")
        
        if head is not None:
            # head đã ở đơn vị cm (vì _extract_part đã to_cm + smooth)
            cx = pd.Series(head[:, 0], index=idx)
            cy = pd.Series(head[:, 1], index=idx)
        else:
            # fallback: nếu không có head thì dùng body_center như cũ
            cx = ctx.cx
            cy = ctx.cy


        # # --- 2. Khoảng cách tới 4 bức tường ---
        # cx = ctx.cx  # Series
        # cy = ctx.cy  # Series
    
        dist_left   = cx - 0.0
        dist_right  = W - cx
        dist_bottom = cy - 0.0
        dist_top    = H - cy
    
        d_all = np.stack(
            [dist_left.values, dist_right.values, dist_bottom.values, dist_top.values],
            axis=1,  # [F, 4]
        )
    
        dist_wall = np.min(d_all, axis=1)          # khoảng cách tới tường gần nhất
        wall_idx  = np.argmin(d_all, axis=1)       # 0:left, 1:right, 2:bottom, 3:top
    
        dist_wall_s = pd.Series(dist_wall, index=idx, dtype="float32")
        feats["climb_dist_wall"] = dist_wall_s
    
        # --- 3. Vận tốc theo NORMAL & TANGENT của tường gần nhất ---
        vx = ctx.vel[:, 0]
        vy = ctx.vel[:, 1]
    
        # normal hướng VÀO trong arena từ tường
        nx = np.zeros_like(vx, dtype="float32")
        ny = np.zeros_like(vy, dtype="float32")

        # left  wall (x=0)    → normal = (+1, 0)
        # right wall (x=W)    → normal = (-1, 0)
        # bottom wall (y=0)   → normal = (0, +1)
        # top wall (y=H)      → normal = (0, -1)
        nx[wall_idx == 0] =  1.0
        nx[wall_idx == 1] = -1.0
        ny[wall_idx == 2] =  1.0
        ny[wall_idx == 3] = -1.0
    
        # v_normal = v ⋅ n
        v_normal = vx * nx + vy * ny
    
        # thành phần song song tường: v_tan = v - (v⋅n)n
        v_proj_x = v_normal * nx
        v_proj_y = v_normal * ny
        v_tan_x = vx - v_proj_x
        v_tan_y = vy - v_proj_y
        v_tangent = np.sqrt(v_tan_x ** 2 + v_tan_y ** 2)
    
        v_normal_s  = pd.Series(v_normal,  index=idx, dtype="float32")
        v_tangent_s = pd.Series(v_tangent, index=idx, dtype="float32")
    
        feats["climb_normal_vel"]  = v_normal_s
        feats["climb_tangent_vel"] = v_tangent_s
    
        # --- 4. Approach speed: dist_wall giảm mạnh (lao vào tường) ---
        ws = self._scale(15)  # ~0.5s (15 frame ở 30fps)
        min_p = max(ws // 3, 1)

        # diff_dw > 0 khi dist_wall giảm (đi về phía tường)
        diff_dw = -dist_wall_s.diff().fillna(0.0)  # dấu trừ để "giảm" → dương
        approach = diff_dw.rolling(ws, min_periods=min_p).mean()
        feats["climb_approach_speed_wall"] = approach.astype("float32")
    
        # --- 5. Stick score: sát tường + không còn lao vào (v_normal nhỏ) ---
        # gần tường
        thr_cm = 3.0  # tuỳ chỉnh (3cm sát tường)
        near_wall = (dist_wall_s < thr_cm).astype("float32")
    
        # ít lao vào nữa: |v_normal| nhỏ
        stick = near_wall * (1.0 / (1.0 + v_normal_s.abs()))

        # Nếu muốn climb thực sự có chút chuyển động dọc tường:
        # yêu cầu v_tangent > một ngưỡng nhỏ (ví dụ 0.5 cm/s)
        stick = stick * (v_tangent_s > 0.5).astype("float32")
    
        feats["climb_wall_stick_score"] = stick.astype("float32")
    
        # --- 6. Clean NaN/Inf ---
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )
    
        return feats


    def _feat_pairwise(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict:
        """
        Đặc trưng tương tác cặp đôi (Pairwise): Khoảng cách, Tốc độ tiếp cận.
        """
        feats = {}
        if target_ctx is None: 
            return feats

        def zero(): return pd.Series(0.0, index=ctx.idx, dtype="float32")

        def dist_ab(pt_a, pt_b):
            if pt_a is None or pt_b is None: return zero()
            d = np.linalg.norm(pt_a - pt_b, axis=1)
            return pd.Series(d, index=ctx.idx, dtype="float32")

        rel_vec = target_ctx.pos - ctx.pos
        dist = np.linalg.norm(rel_vec, axis=1)
        feats["rel_dist"] = pd.Series(dist, index=ctx.idx, dtype="float32")

        # Khoảng cách
        my_parts = self._extract_parts_dict(ctx, ["nose", "ear_left", "ear_right", "body_center", "tail_base"])
        target_parts = self._extract_parts_dict(target_ctx, ["nose", "body_center", "tail_base", "ear_left", "ear_right"])

        an, tn = my_parts["nose"], target_parts["nose"]
        feats["dist_nose_nose"] = dist_ab(an, tn)
        feats["dist_nose_tail"] = dist_ab(an, target_parts["tail_base"])
        feats["dist_nose_el"]   = dist_ab(an, target_parts["ear_left"])
        feats["dist_nose_er"]   = dist_ab(an, target_parts["ear_right"])
        feats["dist_nose_body"] = dist_ab(an, target_parts["body_center"])
        feats["dist_tail_tail"] = dist_ab(my_parts["tail_base"], target_parts["tail_base"])
        

        #  Hướng - góc nhìn
        def get_body_vec(parts_dict):
            head = parts_dict.get("nose")
            tail = parts_dict.get("tail_base")
            if head is not None and tail is not None:
                return head - tail
            return None

        a_vec = get_body_vec(my_parts)
        t_vec = get_body_vec(target_parts)

        if a_vec is not None and t_vec is not None:
            dot = np.sum(a_vec * t_vec, axis=1)
            mags = np.linalg.norm(a_vec, axis=1) * np.linalg.norm(t_vec, axis=1)
            feats["body_cosine"] = pd.Series(
                np.clip(dot / (mags + 1e-6), -1.0, 1.0), index=ctx.idx, dtype="float32"
            )
        else:
            feats["body_cosine"] = zero()

        # Vector ánh nhìn = Target_Pos - My_Pos = rel_vec
        if a_vec is not None:
            dot_gaze = np.sum(a_vec * rel_vec, axis=1)
            mag_a = np.linalg.norm(a_vec, axis=1)
            feats["gaze_cosine"] = pd.Series(
                np.clip(dot_gaze / (mag_a * dist + 1e-6), -1.0, 1.0),
                index=ctx.idx, dtype="float32"
            )
        else:
            feats["gaze_cosine"] = zero()

        # Vector đơn vị hướng về địch (u)
        dist_safe = dist.copy()
        dist_safe[dist_safe == 0] = 1e-6
        u_vec = rel_vec / dist_safe[:, None]

        # a_vel và t_vel lấy từ Context
        a_vel, t_vel = ctx.vel, target_ctx.vel

        # A. Approach Speed (Vận tốc dọc trục nối 2 con)
        # Dương: Lao vào nhau | Âm: Chạy ra xa nhau
        a_along = np.sum(a_vel * u_vec, axis=1)
        t_along = np.sum(t_vel * (-u_vec), axis=1) # Target hướng ngược lại
        rel_along = np.sum((a_vel - t_vel) * u_vec, axis=1)

        # B. Lateral Speed (Vận tốc ngang - Vuông góc trục nối)
        # Vector chiếu: v_proj = (v . u) * u
        a_proj = a_along[:, None] * u_vec
        a_lat_vec = a_vel - a_proj
        a_lat_speed = np.linalg.norm(a_lat_vec, axis=1)

        feats["approach_speed_agent"]  = pd.Series(a_along, index=ctx.idx, dtype="float32")
        feats["approach_speed_target"] = pd.Series(t_along, index=ctx.idx, dtype="float32")
        feats["approach_speed_rel"]    = pd.Series(rel_along, index=ctx.idx, dtype="float32")
        feats["lateral_speed_agent"]   = pd.Series(a_lat_speed, index=ctx.idx, dtype="float32")
        return feats

    def _feat_follow_pattern(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        """
        Đặc trưng hành vi FOLLOW:
          - Agent ở gần target
          - Cùng hướng (body + velocity)
          - Tốc độ vừa phải
          - Khoảng cách tương đối ổn định trong 0.5–1s
        """
        feats: Dict[str, pd.Series] = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")
    
        # --- 1. CÁC ĐẠI LƯỢNG CƠ BẢN ---
        # Vector Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        rel_dist = np.linalg.norm(rel_vec, axis=1)
        rel_dist_s = pd.Series(rel_dist, index=idx, dtype="float32")
    
        # Speed agent/target
        a_speed = ctx.speed_series.astype("float32")
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32",
        )
    
        # Body vector: nose - tail/body_center
        parts_a = self._extract_parts_dict(ctx, ["nose", "tail_base", "ear_left", "ear_right"])
        parts_t = self._extract_parts_dict(target_ctx, ["nose", "tail_base", "ear_right", "ear_left"])
    
        def body_vec(parts_dict):
            head = parts_dict.get("nose")
            tail = parts_dict.get("tail_base")
            if head is None or tail is None:
                return None
            return head - tail
    
        a_body = body_vec(parts_a)
        t_body = body_vec(parts_t)
    
        if a_body is not None and t_body is not None:
            dot_bt = np.sum(a_body * t_body, axis=1)
            mag_bt = np.linalg.norm(a_body, axis=1) * np.linalg.norm(t_body, axis=1)
            cos_body = np.clip(dot_bt / (mag_bt + 1e-6), -1.0, 1.0)
            cos_body_s = pd.Series(cos_body, index=idx, dtype="float32")
        else:
            cos_body_s = zero()
    
        # Velocity hướng
        a_vel = ctx.vel
        t_vel = target_ctx.vel
        a_speed_np = np.linalg.norm(a_vel, axis=1)
        t_speed_np = np.linalg.norm(t_vel, axis=1)
        moving_mask = (a_speed_np > 1e-3) & (t_speed_np > 1e-3)
    
        # cos giữa hướng velocity 2 con
        dot_v = np.sum(a_vel * t_vel, axis=1)
        mag_v = a_speed_np * t_speed_np + 1e-6
        cos_vel = np.zeros_like(dot_v, dtype="float32")
        cos_vel[moving_mask] = np.clip(dot_v[moving_mask] / mag_v[moving_mask], -1.0, 1.0)
        cos_vel_s = pd.Series(cos_vel, index=idx, dtype="float32")
    
        # --- 2. WINDOW NGẮN (FOLLOW LÀ PATTERN DÀI HƠN ATTACK) ---
        for w30 in [15, 30, 60]:   # ~0.5s, 1s, 2s
            ws = self._scale(w30)
            min_p = max(ws // 3, 1)
    
            # Khoảng cách trung bình & độ dao động
            m_dist = rel_dist_s.rolling(ws, min_periods=min_p).mean()
            s_dist = rel_dist_s.rolling(ws, min_periods=min_p).std()
    
            # Cùng hướng (body + velocity)
            m_cos_body = cos_body_s.rolling(ws, min_periods=min_p).mean()
            m_cos_vel  = cos_vel_s.rolling(ws, min_periods=min_p).mean()
    
            # Tốc độ vừa phải
            m_sp_a = a_speed.rolling(ws, min_periods=min_p).mean()
            m_sp_t = t_speed.rolling(ws, min_periods=min_p).mean()
    
            feats[f"follow_dist_mean_{w30}"] = m_dist
            feats[f"follow_dist_std_{w30}"]  = s_dist
            feats[f"follow_cos_body_mean_{w30}"] = m_cos_body
            feats[f"follow_cos_vel_mean_{w30}"]  = m_cos_vel
            feats[f"follow_speed_agent_mean_{w30}"] = m_sp_a
            feats[f"follow_speed_target_mean_{w30}"] = m_sp_t
    
        # Clean
        for k, v in feats.items():
            feats[k] = (
                v.replace([np.inf, -np.inf], np.nan)
                 .fillna(0.0)
                 .astype("float32")
            )
    
        return feats
    
    def _feat_shortburst_social(self, ctx: AgentContext, target_ctx: AgentContext = None, **kwargs) -> Dict[str, pd.Series]:
        feats = {}
        if target_ctx is None:
            return feats
    
        idx = ctx.idx
        def zero(): return pd.Series(0.0, index=idx, dtype="float32")
    
        # --- Lấy lại vài quantity cơ bản từ pairwise/avoidance ---
        # vector Agent -> Target
        rel_vec = target_ctx.pos - ctx.pos
        rel_dist = np.linalg.norm(rel_vec, axis=1)
        rel_dist_s = pd.Series(rel_dist, index=idx, dtype="float32")
    
        # unit vector
        rel_dist_safe = np.where(rel_dist == 0, 1e-6, rel_dist)
        u_vec = rel_vec / rel_dist_safe[:, None]
    
        # velocity dọc trục nối (approach speed)
        a_vel = ctx.vel
        t_vel = target_ctx.vel
        a_along = np.sum(a_vel * u_vec, axis=1)                # +: lao vào target
        t_along = np.sum(t_vel * (-u_vec), axis=1)             # +: target lao vào agent
        rel_along = np.sum((a_vel - t_vel) * u_vec, axis=1)    # +: lại gần nhau
    
        a_along_s = pd.Series(a_along, index=idx, dtype="float32")
        t_along_s = pd.Series(t_along, index=idx, dtype="float32")
        rel_along_s = pd.Series(rel_along, index=idx, dtype="float32")
    
        # speed agent / target
        a_speed = ctx.speed_series
        t_speed = pd.Series(
            np.linalg.norm(target_ctx.vel, axis=1),
            index=idx,
            dtype="float32"
        )
    
        # heading_rel_cos ~ escape / approach
        # vector body của agent
        # (reuse idea từ _feat_pairwise)
        # head ~ nose, tail ~ tail_base/body_center
        parts_a = self._extract_parts_dict(ctx, ["head", "tail_base"])
        head_a = parts_a.get("nose")
        tail_a = parts_a.get("tail_base")
    
        if head_a is not None and tail_a is not None:
            body_vec_a = head_a - tail_a
            dot = np.sum(body_vec_a * rel_vec, axis=1)
            mag = np.linalg.norm(body_vec_a, axis=1) * rel_dist_safe
            heading_cos = np.clip(dot / (mag + 1e-6), -1.0, 1.0)
            heading_cos_s = pd.Series(heading_cos, index=idx, dtype="float32")
        else:
            heading_cos_s = zero()
    
        # --- Rolling window 10, 20, 30 frames (ở fps gốc) ---
        for w30 in [10, 20, 30]:
            ws = self._scale(w30)
            min_p = max(1, ws // 3)
    
            # Attack-like: approach mạnh, khoảng cách giảm nhanh
            feats[f"sb_att_approach_mean_{w30}"] = a_along_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_att_rel_along_mean_{w30}"] = rel_along_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_att_dist_delta_{w30}"] = (rel_dist_s - rel_dist_s.shift(ws)).fillna(0.0)
    
            # Chase-like: agent & target đều nhanh, dist tương đối nhỏ
            feats[f"sb_chase_speed_agent_mean_{w30}"] = a_speed.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_chase_speed_target_mean_{w30}"] = t_speed.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_chase_dist_mean_{w30}"] = rel_dist_s.rolling(ws, min_periods=min_p).mean()
    
            # Escape-like: heading ngược, dist tăng nhanh
            feats[f"sb_esc_heading_cos_mean_{w30}"] = heading_cos_s.rolling(ws, min_periods=min_p).mean()
            feats[f"sb_esc_dist_gain_{w30}"] = (rel_dist_s.shift(-ws) - rel_dist_s).fillna(0.0)
    
        # clip & fillna
        for k, v in feats.items():
            feats[k] = v.replace([np.inf, -np.inf], np.nan).fillna(0.0).astype("float32")
    
        return feats



    # --- Compatibility methods ---
    
    def build_pose_tensor(self, tracking: pd.DataFrame):
        """
        Convert long-format tracking rows into a position tensor plus
        per-mouse wide tables.

        Args:
            tracking: DataFrame with columns ``video_frame``, ``mouse_id``,
                ``bodypart``, ``x`` and ``y``.

        Returns:
            Tuple ``(frames, mouse_ids, pos, per_mouse_df)`` where ``frames``
            is the sorted array of unique frame numbers, ``mouse_ids`` the list
            of mouse identifiers, ``pos`` a float32 array ``[frames, mice, 2]``
            holding each mouse's centre (the ``body_center`` part when present,
            otherwise the mean over all of its parts), and ``per_mouse_df``
            maps each mouse id to its wide (bodypart, coordinate) table.
        """
        ordered = tracking.sort_values("video_frame")
        frames = np.sort(ordered["video_frame"].unique())

        # Wide table with columns (mouse_id, bodypart, coordinate), float32.
        wide = ordered.pivot(
            index="video_frame",
            columns=["mouse_id", "bodypart"],
            values=["x", "y"],
        )
        wide = wide.reorder_levels([1, 2, 0], axis=1)
        wide = wide.sort_index(axis=1)
        wide = wide.astype("float32")

        mouse_ids = list(wide.columns.get_level_values(0).unique())
        pos = np.full((len(frames), len(mouse_ids), 2), np.nan, dtype=np.float32)
        per_mouse_df = {}

        for slot, mouse in enumerate(mouse_ids):
            mouse_df = wide[mouse]
            per_mouse_df[mouse] = mouse_df

            available_parts = mouse_df.columns.get_level_values(0)
            if "body_center" in available_parts:
                center = mouse_df["body_center"]
                xs, ys = center["x"], center["y"]
            else:
                # No explicit centre: average every tracked part instead.
                xs = mouse_df.xs("x", level=1, axis=1).mean(axis=1)
                ys = mouse_df.xs("y", level=1, axis=1).mean(axis=1)

            pos[:, slot, 0] = xs.reindex(frames).values
            pos[:, slot, 1] = ys.reindex(frames).values

        return frames, mouse_ids, pos, per_mouse_df

    def extract_agent_target(
        self, 
        frames: np.ndarray, 
        mouse_ids: List[Any], 
        pos: np.ndarray, 
        agent_id: Any, 
        target_id: Any, 
        per_mouse_df: Dict = None
    ) -> pd.DataFrame:
        """
        Trích xuất đặc trưng cho cặp (Agent, Target).
        """
        try:
            aid_idx = mouse_ids.index(agent_id)
        except ValueError:
            return pd.DataFrame() 

        # 1. Build Agent Context
        ctx_agent = self._build_context(
            frames, 
            pos[:, aid_idx, :], 
            per_mouse_df.get(agent_id) if per_mouse_df else None
        )

        # 2. Build Target Context
        ctx_target = None
        if self.cfg.use_pairwise and target_id is not None and target_id in mouse_ids:
             tid_idx = mouse_ids.index(target_id)
             ctx_target = self._build_context(
                 frames, 
                 pos[:, tid_idx, :], 
                 per_mouse_df.get(target_id) if per_mouse_df else None
             )

        # 3. Run all features
        all_data = {}
        for func_name, func in self.feature_registry.items():
            out_dict = func(ctx_agent, target_ctx=ctx_target)
            all_data.update(out_dict)

        df_out = pd.DataFrame(all_data, index=ctx_agent.idx)
        df_out = df_out.replace([np.inf, -np.inf], np.nan).fillna(0.0)
        
        return df_out.reindex(sorted(df_out.columns), axis=1)


from __future__ import annotations
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import gc
import itertools
import json
import time
import warnings
import joblib
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedGroupKFold

# === IMPORT MODEL & OPTUNA ===
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
import optuna

# Global configuration: keep library output quiet.
optuna.logging.set_verbosity(optuna.logging.WARNING)  # only Optuna warnings and above
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings('ignore', category=UserWarning)
np.seterr(invalid="ignore", divide="ignore")  # no NumPy warnings for invalid ops / division by zero

# Metric: imported from an extra directory appended to the module search path.
import sys
sys.path.append("/kaggle/usr/lib/mabe-f-beta")
try:
    from metric import score
except ImportError:
    # Fallback stub used when the metric module is not importable; it accepts
    # any arguments and always returns 0.0.
    def score(*args, **kwargs): return 0.0

# =========================================================
# 1. CẤU HÌNH & SEED
# =========================================================
SEED = 42
def seed_everything(seed=42):
    """Seed the global random number generators used by this script.

    Seeds both Python's ``random`` module and NumPy's legacy global generator,
    so code relying on either source is reproducible. Library-specific seeds
    (XGBoost, CatBoost, LightGBM, Optuna) are passed separately through their
    own parameters.

    Args:
        seed: integer seed applied to every generator.
    """
    import random  # local import: this cell's import block does not provide it

    random.seed(seed)
    np.random.seed(seed)
seed_everything(SEED)

# Competition data layout: tracking / annotation parquet files live in
# per-lab sub-directories of these folders (see load_tracking / load_annotation).
INPUT_DIR = Path("/kaggle/input/MABe-mouse-behavior-detection")
TRAIN_TRACKING_DIR = INPUT_DIR / "train_tracking"
TRAIN_ANNOTATION_DIR = INPUT_DIR / "train_annotation"
TEST_TRACKING_DIR = INPUT_DIR / "test_tracking"

# WORKING_DIR receives the submission file; RESULTS_DIR holds the per-lab /
# per-behavior model folders read at inference time.
# NOTE(review): RESULTS_DIR points below /kaggle/input, which is presumably a
# read-only dataset mount - the mkdir below can then only succeed when the
# directory already exists. Confirm before re-running training in this setup.
WORKING_DIR = Path("/kaggle/working")
RESULTS_DIR = Path(r"/kaggle/input/results-ensemble-optuna2")
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

# Behavior names split into single-animal and pairwise groups; the main
# script uses them to decide whether a trained behavior is run per mouse or
# per ordered (agent, target) pair.
SELF_BEHAVIORS = ["biteobject", "climb", "dig", "exploreobject", "freeze", "genitalgroom", "huddle", "rear", "rest", "run", "selfgroom"]
PAIR_BEHAVIORS = ["allogroom", "approach", "attack", "attemptmount", "avoid", "chase", "chaseattack", "defend", "disengage", "dominance", "dominancegroom", "dominancemount", "ejaculate", "escape", "flinch", "follow", "intromit", "mount", "reciprocalsniff", "shepherd", "sniff", "sniffbody", "sniffface", "sniffgenital", "submit", "tussle"]
# Placeholder list of videos to exclude; empty, and not referenced in the visible code.
BAD_VIDEOS = []

# =========================================================
# 2. DATA LOADING & PREPARATION (NO CACHE)
# =========================================================

def load_metadata() -> pd.DataFrame:
    """Read the training metadata table ``train.csv`` located in ``INPUT_DIR``."""
    train_csv = INPUT_DIR / "train.csv"
    return pd.read_csv(train_csv)

def get_video_params(video_id: Any, meta: pd.DataFrame) -> Tuple[float, float]:
    """Look up the frame rate and pixel scale of one video.

    Args:
        video_id: value matched against ``meta["video_id"]``.
        meta: metadata table with ``video_id``, ``frames_per_second`` and
            ``pix_per_cm_approx`` columns.

    Returns:
        ``(fps, pix_per_cm)``. The defaults ``30.0`` / ``1.0`` are used when
        the video is not listed, and also for any individual value that is
        missing (NaN), infinite or non-positive. The latter keeps the training
        path consistent with the inference path, which already replaces an
        unusable pixel scale by ``1.0``.
    """
    default_fps, default_pix = 30.0, 1.0

    row = meta.loc[meta["video_id"] == video_id]
    if row.empty:
        return default_fps, default_pix
    row = row.iloc[0]

    fps = float(row["frames_per_second"])
    pix = float(row["pix_per_cm_approx"])
    # A NaN or non-positive scale would poison every downstream feature.
    if not (np.isfinite(fps) and fps > 0):
        fps = default_fps
    if not (np.isfinite(pix) and pix > 0):
        pix = default_pix
    return fps, pix

def load_tracking(lab_id: str, video_id: Any, is_test=False) -> pd.DataFrame:
    """Load the tracking parquet of one video.

    The file is looked up as ``<root>/<lab_id>/<video_id>.parquet`` where the
    root is the test tracking folder when ``is_test`` is true and the training
    tracking folder otherwise.

    Raises:
        FileNotFoundError: when the parquet file does not exist.
    """
    root = TRAIN_TRACKING_DIR
    if is_test:
        root = TEST_TRACKING_DIR

    parquet_path = root / str(lab_id) / f"{video_id}.parquet"
    if parquet_path.exists():
        return pd.read_parquet(parquet_path)
    raise FileNotFoundError(parquet_path)

def load_annotation(lab_id: str, video_id: Any) -> pd.DataFrame:
    """Load the behavior annotations of one training video.

    Returns the ``agent_id``, ``target_id``, ``action``, ``start_frame`` and
    ``stop_frame`` columns of ``<annotation root>/<lab_id>/<video_id>.parquet``,
    or an empty frame with those columns when the file does not exist.
    """
    wanted = ["agent_id", "target_id", "action", "start_frame", "stop_frame"]
    parquet_path = TRAIN_ANNOTATION_DIR / str(lab_id) / f"{video_id}.parquet"
    if parquet_path.exists():
        return pd.read_parquet(parquet_path)[wanted]
    return pd.DataFrame(columns=wanted)

# Feature loader WITHOUT caching, to avoid running out of RAM
def get_frame_features_no_cache(lab_id, video_id, agent_id, target_id, meta, is_test=False):
    """Compute per-frame features for one (agent, target) pair, without caching.

    Args:
        lab_id: lab folder name of the video.
        video_id: video identifier (also the parquet file stem).
        agent_id: mouse whose behaviour is described.
        target_id: interaction partner (the caller passes the agent itself
            for single-animal behaviors).
        meta: metadata table providing ``frames_per_second`` and
            ``pix_per_cm_approx`` for the video.
        is_test: read tracking data from the test folder instead of train.

    Returns:
        ``(frames, features_df)``: the sorted frame numbers and the feature
        table indexed by them. ``features_df`` is returned empty (and
        unindexed) when the extractor produced no rows, e.g. because the agent
        is absent from the tracking data, so callers can test ``.empty``.

    Changes vs. the previous version:
      * Train and test now share one metadata lookup: a video missing from
        ``meta`` falls back to default parameters instead of raising
        IndexError in the test branch, and an unusable pixel scale is replaced
        by 1.0 in both modes (previously test only).
      * The frame index is only assigned when the row counts match; assigning
        it to the extractor's empty result raised a length-mismatch error.
    """
    fps, pix = get_video_params(video_id, meta)
    if not (np.isfinite(pix) and pix > 0):
        pix = 1.0

    tracking = load_tracking(lab_id, video_id, is_test)

    # FeatureExtractor is defined in an earlier notebook cell.
    fe = FeatureExtractor(fps=fps, pix_per_cm=pix, smooth_sigma=1.0, use_pairwise=True)

    frames, mouse_ids, pos, per_mouse_df = fe.build_pose_tensor(tracking)

    features_df = fe.extract_agent_target(
        frames=frames, mouse_ids=mouse_ids, pos=pos,
        agent_id=agent_id, target_id=target_id, per_mouse_df=per_mouse_df
    )
    if len(features_df) == len(frames):
        features_df.index = frames
    return frames, features_df

def build_frame_dataset_for_lab_behavior(lab_id, behavior, train_meta, mode="self"):
    """Assemble the frame-level training set of one behavior in one lab.

    For every video of the lab that has annotations for ``behavior``, and for
    every annotated (agent, target) pair, per-frame features are computed and
    each frame is labelled 1 when it lies inside an annotated interval
    ``[start_frame, stop_frame)`` and 0 otherwise.

    Args:
        lab_id: lab whose videos are used.
        behavior: action name to build the dataset for.
        train_meta: training metadata table (``lab_id``, ``video_id``, ...).
        mode: ``"self"`` computes features with the agent as its own target
            and falls back to agent-only annotation matching; any other value
            uses the annotated target.

    Returns:
        ``(indices, features, labels)``: a frame of
        ``video_id / agent_id / target_id / video_frame`` keys, the feature
        table with matching row order, and an int8 label vector. All three
        are empty when nothing could be collected.

    Changes vs. the previous version:
      * Annotation intervals are resolved BEFORE features are computed, so
        pairs without any positive frame no longer trigger a tracking load
        and a full feature extraction.
      * Pairs whose feature table does not have one row per frame (e.g. an
        empty table) are skipped instead of silently misaligning rows.
      * Interval iteration uses the two columns directly instead of
        ``iterrows``.
    """
    videos = train_meta[train_meta["lab_id"] == lab_id]["video_id"].unique().tolist()
    index_list, feature_list, label_list = [], [], []

    for video_id in videos:
        ann = load_annotation(lab_id, video_id)
        if ann.empty:
            continue

        ann_bhv = ann[ann["action"] == behavior]
        if ann_bhv.empty:
            continue

        pairs = ann_bhv[["agent_id", "target_id"]].drop_duplicates().values.tolist()
        for agent_id, target_id in pairs:
            ann_pair = ann_bhv[(ann_bhv["agent_id"] == agent_id) & (ann_bhv["target_id"] == target_id)]
            if ann_pair.empty and mode == "self":
                # Exact pair match failed; fall back to every annotation of this agent.
                ann_pair = ann_bhv[ann_bhv["agent_id"] == agent_id]

            # Positive frames: union of the half-open annotated intervals.
            pos_frames = set()
            for start, stop in zip(ann_pair["start_frame"], ann_pair["stop_frame"]):
                pos_frames.update(range(int(start), int(stop)))
            if not pos_frames:
                continue

            target_id_use = agent_id if mode == "self" else target_id

            # Features are computed on the fly (no cache).
            frames, feat_df = get_frame_features_no_cache(lab_id, video_id, agent_id, target_id_use, train_meta)
            if len(feat_df) != len(frames):
                # No usable per-frame features for this pair.
                continue

            label = np.isin(frames, list(pos_frames)).astype("int8")
            if label.sum() == 0:
                continue

            index_list.append(pd.DataFrame({"video_id": video_id, "agent_id": agent_id, "target_id": target_id, "video_frame": frames}))
            feature_list.append(feat_df.reset_index(drop=True))
            label_list.append(label)

            # Drop the local references right away to keep peak memory low.
            del frames, feat_df, label

    if not index_list:
        return pd.DataFrame(), pd.DataFrame(), np.zeros(0, dtype="int8")

    return (
        pd.concat(index_list, ignore_index=True),
        pd.concat(feature_list, ignore_index=True),
        np.concatenate(label_list).astype("int8"),
    )

# =========================================================
# 3. TRAINING & ENSEMBLE HELPERS
# =========================================================

def train_catboost_fold(X_tr, y_tr, X_va, y_va, sw=1.0):
    """Fit one CatBoost binary classifier on a CV fold.

    Trains on GPU device 0 for up to 1000 iterations, stopping early after 20
    rounds without improvement on the validation pool and keeping the best
    iteration.

    Args:
        X_tr, y_tr: training features and labels.
        X_va, y_va: validation features and labels (used for early stopping).
        sw: value passed as ``scale_pos_weight``.

    Returns:
        The fitted ``CatBoostClassifier``.
    """
    params = dict(
        iterations=1000,
        learning_rate=0.05,
        depth=6,
        scale_pos_weight=sw,
        task_type='GPU',
        devices='0',
        verbose=0,
        allow_writing_files=False,
        l2_leaf_reg=5,
        bootstrap_type='Bernoulli',
        subsample=0.8,
        random_seed=SEED,
    )
    model = cb.CatBoostClassifier(**params)
    train_pool = cb.Pool(X_tr, y_tr)
    valid_pool = cb.Pool(X_va, y_va)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=20, use_best_model=True)
    return model

def train_lightgbm_fold(X_tr, y_tr, X_va, y_va, sw=1.0):
    """Fit one LightGBM binary booster on a CV fold.

    Trains with the ``gpu`` device for up to 1000 boosting rounds, with early
    stopping after 20 rounds without improvement of the validation log-loss.

    Args:
        X_tr, y_tr: training features and labels.
        X_va, y_va: validation features and labels (used for early stopping).
        sw: value passed as ``scale_pos_weight``.

    Returns:
        The trained ``lightgbm.Booster``.
    """
    params = dict(
        objective='binary',
        metric='binary_logloss',
        learning_rate=0.05,
        max_depth=6,
        num_leaves=31,
        scale_pos_weight=sw,
        device='gpu',
        verbosity=-1,
        min_child_weight=5,
        subsample=0.8,
        colsample_bytree=0.8,
        subsample_freq=1,
        seed=SEED,
    )
    train_set = lgb.Dataset(X_tr, y_tr)
    valid_set = lgb.Dataset(X_va, y_va)
    stopper = lgb.early_stopping(20, verbose=False)
    booster = lgb.train(params, train_set, 1000, valid_sets=[valid_set], callbacks=[stopper])
    return booster

def optimize_ensemble_weights(oof_dict, y_true):
    """Search blend weights and a decision threshold that maximise OOF F1.

    Runs 50 Optuna (TPE, seeded) trials. Each trial draws one raw weight in
    [0, 1] per model plus a threshold in [0.1, 0.9]; the weights are
    normalised to sum to ~1 before blending the out-of-fold predictions.

    Args:
        oof_dict: mapping model name -> out-of-fold probability array.
        y_true: binary ground-truth labels aligned with those arrays.

    Returns:
        ``(weights, threshold)`` where ``weights`` maps each model name to
        its normalised best weight.
    """
    names = list(oof_dict.keys())
    eps = 1e-6  # keeps the normalisation finite when every raw weight is 0

    def objective(trial):
        # Draw order (weights first, threshold last) is kept fixed so the
        # seeded sampler produces the same trial sequence.
        raw = [trial.suggest_float(name, 0.0, 1.0) for name in names]
        total = sum(raw) + eps
        blend = np.zeros_like(y_true, dtype=float)
        for name, value in zip(names, raw):
            blend += oof_dict[name] * (value / total)
        cut = trial.suggest_float("th", 0.1, 0.9)
        return f1_score(y_true, (blend >= cut).astype(int), zero_division=0)

    sampler = optuna.samplers.TPESampler(seed=SEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=50)

    best = study.best_params
    threshold = best.pop("th")
    best_raw = [best[name] for name in names]
    total = sum(best_raw) + eps
    return {name: value / total for name, value in zip(names, best_raw)}, threshold

def train_validate_ensemble(lab_id, behavior, indices, features, labels):
    """Train XGBoost / CatBoost / LightGBM with 3-fold grouped CV and blend them.

    For each fold the three models are trained, saved under
    ``RESULTS_DIR/<lab_id>/<behavior>/fold_<k>/`` and used to fill the
    out-of-fold (OOF) predictions. Blend weights and a threshold are then
    optimised on the OOF predictions and written to ``ensemble_params.json``;
    the blended OOF table goes to ``oof.parquet`` and the score to ``f1.txt``.

    Args:
        lab_id: lab name (first level of the results folder).
        behavior: behavior name (second level of the results folder).
        indices: frame keys; its ``video_id`` column defines the CV groups.
        features: feature table; column names are stored in the XGBoost model.
        labels: binary label vector aligned with ``features``.

    Returns:
        The OOF F1 of the blended prediction as a float, or 0.0 when there
        are no labels or no positive label.
    """
    res_dir = RESULTS_DIR / lab_id / behavior
    res_dir.mkdir(parents=True, exist_ok=True)

    # Nothing to learn without at least one positive frame.
    if len(labels) == 0 or labels.sum() == 0: return 0.0

    X = features.values.astype("float32")
    y = labels.astype("int8")
    groups = indices["video_id"].values
    
    oof_preds = {m: np.zeros(len(y), dtype="float32") for m in ["xgb", "cat", "lgb"]}
    folds = np.ones(len(y), dtype="int8") * -1  # -1 = not yet assigned to a validation fold

    # Splits are stratified on the label and grouped by video_id.
    cv = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=SEED)
    for fold, (tr_idx, va_idx) in enumerate(cv.split(X, y, groups=groups)):
        print(f"   Fold {fold}...", end=" ")
        fd_dir = res_dir / f"fold_{fold}"; fd_dir.mkdir(parents=True, exist_ok=True)
        X_tr, y_tr = X[tr_idx], y[tr_idx]; X_va, y_va = X[va_idx], y[va_idx]
        # Class-imbalance weight: negatives / positives of the training part.
        pos = y_tr.sum(); neg = len(y_tr) - pos
        sw = float(neg/pos) if pos > 0 else 1.0

        # 1. XGBoost
        dtr = xgb.QuantileDMatrix(X_tr, label=y_tr, feature_names=features.columns.tolist(), max_bin=64)
        dva = xgb.DMatrix(X_va, label=y_va, feature_names=features.columns.tolist())
        xp = {
            "objective":"binary:logistic", "eval_metric":"logloss", "device":"cuda", 
            "tree_method":"hist", "learning_rate":0.05, "max_depth":6, "scale_pos_weight":sw,
            "min_child_weight":5, "subsample":0.8, "colsample_bytree":0.8, "max_bin":64, "seed": SEED
        }
        
        # === 'evals=' WAS ADDED TO THE CALL BELOW ===
        # (the EarlyStopping callback monitors the validation set passed via evals)
        mx = xgb.train(
            params=xp, 
            dtrain=dtr, 
            num_boost_round=1000, 
            evals=[(dva, "valid")],
            callbacks=[xgb.callback.EarlyStopping(rounds=20, save_best=True)], 
            verbose_eval=False
        )
        mx.save_model(fd_dir / "model_xgb.json")
        oof_preds["xgb"][va_idx] = mx.predict(dva)

        # 2. CatBoost
        mc = train_catboost_fold(X_tr, y_tr, X_va, y_va, sw)
        mc.save_model(str(fd_dir / "model_cat.cbm"))
        oof_preds["cat"][va_idx] = mc.predict_proba(X_va)[:,1]

        # 3. LightGBM
        ml = train_lightgbm_fold(X_tr, y_tr, X_va, y_va, sw)
        ml.save_model(fd_dir / "model_lgb.txt")
        oof_preds["lgb"][va_idx] = ml.predict(X_va)
        folds[va_idx] = fold
        
        print("Done.")
        # Release fold data and models before the next fold.
        del X_tr, y_tr, X_va, y_va, dtr, dva, mx, mc, ml
        gc.collect()

    print("   Optimizing Weights...", end=" ")
    weights, th = optimize_ensemble_weights(oof_preds, y)
    with open(res_dir / "ensemble_params.json", "w") as f: json.dump({"weights": weights, "threshold": th}, f)
    
    # Blend the OOF predictions with the optimised weights and threshold them.
    final_pred = sum(oof_preds[m] * weights[m] for m in weights)
    final_lbl = (final_pred >= th).astype("int8")
    
    # Save OOF
    df = indices.copy(); df["fold"] = folds; df["pred"] = final_pred; df["lbl"] = final_lbl
    df.to_parquet(res_dir / "oof.parquet", index=False)
    
    f1 = f1_score(y, final_lbl, zero_division=0)
    print(f"Best F1: {f1:.4f} (Th={th:.2f}, W={weights})")
    (res_dir / "f1.txt").write_text(f"{f1:.6f}")
    return float(f1)

# =========================================================
# 4. INFERENCE
# =========================================================

def load_ensemble_models(lab_id, behavior):
    """Load the saved per-fold models of one lab / behavior.

    Scans ``RESULTS_DIR/<lab_id>/<behavior>/fold_*`` in sorted order. A fold
    is used only when its XGBoost model file exists; the CatBoost and LightGBM
    models are optional and are set to ``None`` when they cannot be loaded.

    Compared with the previous version, the optional loads no longer use a
    bare ``except:`` (which also swallowed KeyboardInterrupt / SystemExit),
    and a failed load is reported instead of being ignored silently.

    Returns:
        List of dicts ``{"xgb": Booster, "cat": model-or-None,
        "lgb": Booster-or-None}``, one per usable fold; empty when the
        behavior folder does not exist.
    """
    base = RESULTS_DIR / lab_id / behavior
    if not base.exists():
        return []

    models = []
    for fd in sorted(base.glob("fold_*")):
        xgb_path = fd / "model_xgb.json"
        if not xgb_path.exists():
            continue

        xgb_b = xgb.Booster()
        xgb_b.load_model(str(xgb_path))

        # Best effort: a fold stays usable without CatBoost / LightGBM.
        try:
            cat_m = cb.CatBoostClassifier()
            cat_m.load_model(str(fd / "model_cat.cbm"))
        except Exception as exc:
            print(f"[load_ensemble_models] CatBoost model not loaded from {fd}: {exc}")
            cat_m = None

        try:
            lgb_m = lgb.Booster(model_file=str(fd / "model_lgb.txt"))
        except Exception as exc:
            print(f"[load_ensemble_models] LightGBM model not loaded from {fd}: {exc}")
            lgb_m = None

        models.append({"xgb": xgb_b, "cat": cat_m, "lgb": lgb_m})
    return models

def predict_behaviors_for_pair(lab_id, video_id, aid, tid, behaviors, test_meta):
    """Predict behavior segments for one (agent, target) pair of a test video.

    For every behavior with saved ensemble parameters and models, the fold
    models are blended with the stored weights; blended scores below the
    stored threshold are zeroed and the result is averaged over folds. Each
    frame is then assigned the behavior with the highest score ("none" when
    all scores are 0) and consecutive frames with the same label are merged
    into segments.

    Args:
        lab_id: lab name; only "NiftyGoldfinch" is handled.
        video_id: test video identifier.
        aid: agent mouse id.
        tid: target mouse id.
        behaviors: behavior names to score.
        test_meta: test metadata table.

    Returns:
        ``None`` for any other lab; otherwise a DataFrame of segments with
        ``video_id``, ``action``, ``start_frame`` and ``stop_frame``
        (``stop_frame`` is the last frame of the run plus one). An empty
        frame with those columns is returned when no features or no scores
        are available.
    """
    segment_cols = ["video_id", "action", "start_frame", "stop_frame"]

    if lab_id != "NiftyGoldfinch":
        return None

    frames, feat_df = get_frame_features_no_cache(lab_id, video_id, aid, tid, test_meta, is_test=True)
    if feat_df.empty:
        return pd.DataFrame(columns=segment_cols)

    scores = {}
    for bhv in behaviors:
        params_path = RESULTS_DIR / lab_id / bhv / "ensemble_params.json"
        if not params_path.exists():
            continue
        with open(params_path) as fh:
            params = json.load(fh)
        weights = params["weights"]
        threshold = params["threshold"]

        fold_models = load_ensemble_models(lab_id, bhv)
        if not fold_models:
            continue

        # Rebuild the training column layout; features unknown to this
        # extraction stay at 0.
        cols = fold_models[0]["xgb"].feature_names
        X = pd.DataFrame(0.0, index=feat_df.index, columns=cols, dtype=np.float32)
        shared = list(set(cols) & set(feat_df.columns))
        if shared:
            X[shared] = feat_df[shared]
        dtest = xgb.DMatrix(X, feature_names=cols)

        agg = np.zeros(len(feat_df), dtype=np.float32)
        for bundle in fold_models:
            px = bundle["xgb"].predict(dtest)
            if bundle["cat"]:
                pc = bundle["cat"].predict_proba(X)[:,1]
            else:
                pc = np.zeros_like(px)
            if bundle["lgb"]:
                pl = bundle["lgb"].predict(X)
            else:
                pl = np.zeros_like(px)

            blended = px*weights.get("xgb", 0.33) + pc*weights.get("cat", 0.33) + pl*weights.get("lgb", 0.33)
            # Keep the blended score only where it clears the threshold.
            agg += blended * (blended >= threshold).astype("int8")

        scores[bhv] = agg / len(fold_models)

        del X, dtest
        gc.collect()

    if not scores:
        return pd.DataFrame(columns=segment_cols)

    # Per-frame winner across behaviors.
    names = list(scores.keys())
    mat = np.vstack([scores[b] for b in names]).T
    lbls = np.where(mat.max(1)==0, "none", np.array(names)[mat.argmax(1)])

    # Merge runs of identical consecutive labels into segments.
    segs = []
    for action, run in itertools.groupby(zip(frames, lbls), key=lambda pair: pair[1]):
        if action == "none":
            continue
        run_frames = [frame for frame, _ in run]
        segs.append({
            "video_id": int(video_id),
            "action": action,
            "start_frame": int(run_frames[0]),
            "stop_frame": int(run_frames[-1])+1,
        })

    return pd.DataFrame(segs)

# =========================================================
# 5. MAIN
# =========================================================
target_lab = "NiftyGoldfinch"
print("\n=== START INFERENCE ===")
test_meta = pd.read_csv(INPUT_DIR / "test.csv")
test_meta = test_meta[test_meta["lab_id"] == target_lab].reset_index(drop=True)

trained = sorted([p.name for p in (RESULTS_DIR/target_lab).iterdir() if p.is_dir()])
sb, pb = [b for b in trained if b in SELF_BEHAVIORS], [b for b in trained if b in PAIR_BEHAVIORS]

all_segs = []
def fid(i):
    """Return the submission label for a mouse id.

    Ids whose string form already starts with "mouse" are returned as that
    string; anything else gets the "mouse" prefix.
    """
    label = str(i)
    if label.startswith("mouse"):
        return label
    return f"mouse{i}"

for vid in sorted(test_meta["video_id"].unique()):
    print(f"Predicting Video {vid}...")
    tr = load_tracking(target_lab, vid, is_test=True)
    mids = sorted(tr["mouse_id"].unique())

    # Queue (agent, target, behaviours, target label) jobs: self behaviours for
    # every mouse first, then pair behaviours for every ordered pair of mice.
    jobs = []
    if sb:
        jobs.extend((m, m, sb, "self") for m in mids)
    if pb and len(mids) > 1:
        jobs.extend((a, t, pb, fid(t)) for a, t in itertools.permutations(mids, 2))

    for agent_mid, target_mid, behaviors, target_label in jobs:
        df = predict_behaviors_for_pair(
            target_lab, vid, agent_mid, target_mid, behaviors, test_meta
        )
        if df is None or df.empty:
            continue
        df["agent_id"] = fid(agent_mid)
        df["target_id"] = target_label
        all_segs.append(df)

    # Release the tracking table before loading the next video.
    del tr
    gc.collect()

cols = ["video_id", "agent_id", "target_id", "action", "start_frame", "stop_frame"]

if not all_segs:
    # Empty CSV (0 rows) that still carries every column plus row_id.
    sub3 = pd.DataFrame(columns=["row_id"] + cols)
else:
    sub3 = pd.concat(all_segs, ignore_index=True)[cols]
    sub3 = sub3.sort_values(
        ["video_id", "agent_id", "target_id", "action", "start_frame"]
    ).reset_index(drop=True)
    sub3.insert(0, "row_id", np.arange(len(sub3), dtype=np.int64))

sub3.to_csv(WORKING_DIR / "submission9.csv", index=False)
print(f"\nDone! Saved submission to {WORKING_DIR / 'submission9.csv'}")


=== START INFERENCE ===

Done! Saved submission to /kaggle/working/submission9.csv


# end

In [18]:
from pathlib import Path
import shutil
import gc

WORKING_DIR = Path("/kaggle/working")

# 1) Remove everything in /kaggle/working except .csv files.
for path in WORKING_DIR.iterdir():
    if path.is_file():
        # Keep .csv files.
        if path.suffix == ".csv":
            continue
        try:
            path.unlink()
        except OSError as e:
            print(f"Cannot remove file {path}: {e}")
    elif path.is_dir():
        # ignore_errors=True keeps the removal best-effort (it continues past
        # entries it cannot delete) but also means rmtree never raises, so a
        # failure has to be detected by checking the path afterwards.
        shutil.rmtree(path, ignore_errors=True)
        if path.exists():
            print(f"Cannot remove dir {path}: directory still exists after rmtree")


gc.collect()

78

In [19]:
# ============================================================
# Improved MABe Social Behavior Detection with XGBoost
# Improved inference notebook (fold aggregation + postprocessing)
# ============================================================

from pathlib import Path
import os
import sys

# ------------------------------------------------------------
# Input dataset checks
# ------------------------------------------------------------
COMP_DIR = Path("/kaggle/input/MABe-mouse-behavior-detection")
STARTER_DIR = Path("/kaggle/input/mabe-starter-train-ja")
MABE_PKG_DIR = Path("/kaggle/input/mabe-package")

# Fail fast on the first required input directory that is missing, checked in
# the order: competition data, starter dataset, offline package dataset.
for _required_dir, _missing_message in (
    (
        COMP_DIR,
        "Competition dataset 'MABe Challenge - Social Action Recognition in Mice' "
        "must be attached as an input.",
    ),
    (
        STARTER_DIR,
        "Dataset 'mabe-starter-train-ja' is not attached. "
        "Click 'Add input' and add it before running.",
    ),
    (
        MABE_PKG_DIR,
        "Dataset 'mabe-package' is not attached. "
        "It provides the offline xgboost wheel used by the starter models.",
    ),
):
    if not _required_dir.exists():
        raise FileNotFoundError(_missing_message)

# ------------------------------------------------------------
# Install xgboost from offline wheel (no internet)
# ------------------------------------------------------------
!pip install -q --no-index --find-links=/kaggle/input/mabe-package xgboost==3.1.1

# ------------------------------------------------------------
# Copy helper scripts and trained models from starter dataset
# ------------------------------------------------------------
!cp /kaggle/input/mabe-starter-train-ja/self_features.py .
!cp /kaggle/input/mabe-starter-train-ja/pair_features.py .
!cp /kaggle/input/mabe-starter-train-ja/robustify.py .
!cp -r /kaggle/input/mabe-starter-train-ja/results .

# ============================================================
# Imports
# ============================================================
import gc
import re
import ast
import itertools
from pathlib import Path

import numpy as np

# polars is preinstalled on Kaggle GPU/CPU images
try:
    import polars as pl
except ImportError:
    raise ImportError(
        "polars is not available in this environment. "
        "Use a Kaggle GPU/CPU notebook image where polars is preinstalled."
    )

import xgboost as xgb
from tqdm.auto import tqdm

# Helper scripts from starter notebook
%run -i self_features.py
%run -i pair_features.py
%run -i robustify.py

# ============================================================
# Paths and constants
# ============================================================
# Read-only competition inputs.
INPUT_DIR = COMP_DIR
TRAIN_TRACKING_DIR = INPUT_DIR / "train_tracking"
TRAIN_ANNOTATION_DIR = INPUT_DIR / "train_annotation"
TEST_TRACKING_DIR = INPUT_DIR / "test_tracking"

# Writable output locations; per-video feature parquet files go in the two
# feature directories.
WORKING_DIR = Path("/kaggle/working")
SELF_FEATURE_DIR = WORKING_DIR / "self_features"
PAIR_FEATURE_DIR = WORKING_DIR / "pair_features"

for _out_dir in (WORKING_DIR, SELF_FEATURE_DIR, PAIR_FEATURE_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Columns that identify one row of a feature table (everything else is a feature).
INDEX_COLS = ["video_id", "agent_mouse_id", "target_mouse_id", "video_frame"]

# Tracked body-part names.
BODY_PARTS = [
    "ear_left", "ear_right", "nose", "neck", "body_center",
    "lateral_left", "lateral_right", "hip_left", "hip_right",
    "tail_base", "tail_tip",
]

# Behaviour names treated as self behaviours.
SELF_BEHAVIORS = [
    "biteobject", "climb", "dig", "exploreobject", "freeze", "genitalgroom",
    "huddle", "rear", "rest", "run", "selfgroom",
]

# Behaviour names treated as pair (agent -> target) behaviours.
PAIR_BEHAVIORS = [
    "allogroom", "approach", "attack", "attemptmount", "avoid", "chase",
    "chaseattack", "defend", "disengage", "dominance", "dominancegroom",
    "dominancemount", "ejaculate", "escape", "flinch", "follow", "intromit",
    "mount", "reciprocalsniff", "shepherd", "sniff", "sniffbody", "sniffface",
    "sniffgenital", "submit", "tussle",
]

# ============================================================
# Helper functions
# ============================================================

def parse_behaviors_column(behaviors_str: str):
    """
    Parse the ``behaviors_labeled`` cell into a list of "agent,target,behavior" strings.

    The cell is a Python-like list literal; ``ast.literal_eval`` is used for
    safety instead of ``eval``. Two element shapes are accepted:

      "['mouse1,mouse2,sniff', 'mouse2,mouse1,sniff']"            (strings)
      "[('mouse1','mouse2','sniff'), ('mouse2','mouse1','sniff')]" (tuples)

    Tuple/list elements are joined with "," so that every element is a plain
    string, which is what a caller splitting each element on "," needs.

    Args:
        behaviors_str: Raw cell content, or None.

    Returns:
        A list with one entry per labeled behavior ([] when the input is None).
        If the literal is not a list/tuple it is returned unchanged.

    Raises:
        ValueError, SyntaxError: propagated from ``ast.literal_eval`` when the
        text is not a valid Python literal.
    """
    if behaviors_str is None:
        return []

    parsed = ast.literal_eval(behaviors_str)
    if not isinstance(parsed, (list, tuple)):
        return parsed

    return [
        ",".join(str(part) for part in item) if isinstance(item, (list, tuple)) else item
        for item in parsed
    ]


def build_behavior_dataframe(test_df: pl.DataFrame) -> pl.DataFrame:
    """
    Turn the ``behaviors_labeled`` column into a long table with one row per
    (lab_id, video_id, agent, target, behavior).

    Rows whose ``behaviors_labeled`` is null are dropped. Each remaining cell
    is parsed with ``parse_behaviors_column``, exploded to one element per
    row, and each element is split on "," into agent / target / behavior
    after stripping parentheses, single quotes and spaces.
    """

    def _triplet_field(position: int, alias: str):
        # position-th comma-separated field of the element, cleaned of "()' ".
        return (
            pl.col("behaviors_labeled_element")
            .str.split(",")
            .list.get(position)
            .str.replace_all("[()' ]", "")
            .alias(alias)
        )

    labeled = test_df.filter(pl.col("behaviors_labeled").is_not_null()).select(
        ["lab_id", "video_id", "behaviors_labeled"]
    )

    parsed = labeled.with_columns(
        pl.col("behaviors_labeled")
        .map_elements(
            parse_behaviors_column,
            return_dtype=pl.List(pl.Utf8),
        )
        .alias("behaviors_labeled_list")
    )

    exploded = parsed.explode("behaviors_labeled_list").rename(
        {"behaviors_labeled_list": "behaviors_labeled_element"}
    )

    behavior_df = exploded.with_columns(
        _triplet_field(0, "agent"),
        _triplet_field(1, "target"),
        _triplet_field(2, "behavior"),
    ).select(["lab_id", "video_id", "agent", "target", "behavior"])
    return behavior_df


def extract_mouse_id(mouse_str: str) -> int:
    """
    Map a mouse label to its integer id.

    'self' maps to -1; otherwise the digits following the first 'mouse'
    occurrence are returned as an int ('mouse1' -> 1, 'mouse2' -> 2).

    Raises:
        ValueError: if the label is neither 'self' nor contains 'mouse<digits>'.
    """
    if mouse_str == "self":
        return -1

    found = re.search(r"mouse(\d+)", mouse_str)
    if found is None:
        raise ValueError(f"Unexpected mouse id format: {mouse_str}")
    return int(found.group(1))


def load_features_for_group(lab_id, video_id, agent, target):
    """
    Load per-frame features for a given (lab, video, agent, target) group.

    Self groups (target == "self") are read from SELF_FEATURE_DIR and filtered
    on the agent id; pair groups are read from PAIR_FEATURE_DIR and filtered
    on both agent and target ids.

    Args:
        lab_id: Unused here; kept so the signature matches the callers.
        video_id: Video whose ``<video_id>.parquet`` feature file is read.
        agent: Agent label such as 'mouse1'.
        target: Target label such as 'mouse2', or 'self'.

    Returns:
      index_df   - DataFrame with INDEX_COLS
      feature_df - DataFrame with feature columns only
      Both frames have zero rows (but still the split schemas) when no row
      matches the group.
    """
    agent_mouse_id = extract_mouse_id(agent)

    if target == "self":
        feature_path = SELF_FEATURE_DIR / f"{video_id}.parquet"
        row_filter = pl.col("agent_mouse_id") == agent_mouse_id
    else:
        target_mouse_id = extract_mouse_id(target)
        feature_path = PAIR_FEATURE_DIR / f"{video_id}.parquet"
        row_filter = (pl.col("agent_mouse_id") == agent_mouse_id) & (
            pl.col("target_mouse_id") == target_mouse_id
        )

    full_df = pl.scan_parquet(feature_path).filter(row_filter).collect()

    # Split unconditionally: selecting columns works on a zero-row frame too,
    # so the empty case returns the same (index, features) shape as the
    # populated one instead of two copies of the unsplit frame.
    index_df = full_df.select(INDEX_COLS)
    feature_df = full_df.select(pl.exclude(INDEX_COLS))
    return index_df, feature_df


def load_models_for_behavior(lab_id: str, behavior: str):
    """
    Collect every fold's booster and decision threshold for one (lab, behavior).

    Looks for ``fold_*`` directories under WORKING_DIR/results/<lab>/<behavior>
    in sorted order; a fold is used only when both ``model.json`` and
    ``threshold.txt`` are present.

    Returns:
        List of (xgb.Booster, float threshold) tuples, possibly empty.
    """
    behavior_dir = WORKING_DIR / "results" / lab_id / behavior

    loaded = []
    for fold_dir in sorted(behavior_dir.glob("fold_*")):
        model_path = fold_dir / "model.json"
        threshold_path = fold_dir / "threshold.txt"
        if not (model_path.exists() and threshold_path.exists()):
            # Incomplete fold directory: skip it.
            continue

        threshold = float(threshold_path.read_text().strip())
        booster = xgb.Booster(model_file=str(model_path))
        loaded.append((booster, threshold))
    return loaded


def predict_for_group(
    lab_id: str,
    video_id: int,
    agent: str,
    target: str,
    group_behaviors: pl.DataFrame,
):
    """
    Run inference for one group of (lab_id, video_id, agent, target).

    Steps:
      - Labs listed in ``my_labs`` are skipped here (None is returned).
      - For every behavior of the group that has fold models, aggregate the
        folds into a single score column (mean of thresholded probabilities).
      - Pick the best behavior per frame from those scores ("none" when all
        scores are zero).
      - Collapse the per-frame labels into (start_frame, stop_frame) segments.

    Args:
        lab_id: Lab identifier, used to locate the models.
        video_id: Video identifier, used to locate the feature file.
        agent: Agent label such as 'mouse1'.
        target: Target label such as 'mouse2', or 'self'.
        group_behaviors: Frame with a "behavior" column listing the behaviors
            to score for this group.

    Returns:
        A polars DataFrame with columns video_id, agent_id, target_id, action,
        start_frame, stop_frame; or None when the lab is skipped, no feature
        rows exist, or no behavior has a trained model.
        NOTE(review): the last segment of a group gets a null stop_frame
        (there is no following change point to shift from) — confirm that
        downstream post-processing handles this.
    """
    my_labs = ["AdaptableSnail", "BoisterousParrot", "ElegantMink", "GroovyShrew", "JovialSwallow", "PleasantMeerkat", "SparklingTapir", "TranquilPanther", "NiftyGoldfinch"]
    if lab_id in my_labs:
        return None

    index_df, feature_df = load_features_for_group(lab_id, video_id, agent, target)

    if feature_df.height == 0:
        return None

    # Create XGBoost DMatrix once per group and reuse across behaviors
    dtest = xgb.DMatrix(feature_df.to_pandas(), feature_names=feature_df.columns)

    prediction_df = index_df.clone()
    used_cols = []

    # Unique behaviors for this group. maintain_order=True keeps the score
    # columns in a deterministic order, so argmax ties below always resolve
    # to the same behavior.
    unique_behaviors = (
        group_behaviors.select("behavior")
        .unique(maintain_order=True)["behavior"]
        .to_list()
    )

    for behavior in unique_behaviors:
        models = load_models_for_behavior(lab_id, behavior)
        if not models:
            # No trained model for this (lab, behavior) in the starter models
            continue

        # Aggregate over folds: mean of thresholded probabilities
        agg_scores = np.zeros(feature_df.height, dtype=np.float32)

        for model, threshold in models:
            probs = model.predict(dtest)
            labels = (probs >= threshold).astype(np.int8)
            agg_scores += probs * labels

        agg_scores /= max(len(models), 1)

        prediction_df = prediction_df.with_columns(
            pl.Series(name=behavior, values=agg_scores)
        )
        used_cols.append(behavior)

    if not used_cols:
        return None

    # Pick best behavior per frame (over behaviors only)
    cols = used_cols

    prediction_labels_df = (
        prediction_df
        .with_columns(
            pl.struct(pl.col(cols))
            .map_elements(
                lambda row: (
                    "none"
                    if sum(row.values()) == 0
                    else cols[int(np.argmax(list(row.values())))]
                ),
                return_dtype=pl.String,
            )
            .alias("prediction")
        )
        .select(INDEX_COLS + ["prediction"])
    )

    # Convert per frame labels into time segments
    agent_mouse_id = extract_mouse_id(agent)
    target_mouse_id = extract_mouse_id(target)

    group_submission = (
        prediction_labels_df
        # Keep only change points. ne_missing treats the null produced by
        # shift(1) on the first row as "different", so the first frame is kept;
        # a plain != would evaluate to null there and filter() would drop the
        # row, losing any segment that starts on the first frame.
        .filter(pl.col("prediction").ne_missing(pl.col("prediction").shift(1)))
        # Each segment stops where the next change point starts.
        .with_columns(
            pl.col("video_frame").shift(-1).alias("stop_frame")
        )
        .filter(pl.col("prediction") != "none")
        .select(
            pl.col("video_id"),
            (pl.lit("mouse") + pl.lit(agent_mouse_id).cast(pl.Utf8)).alias("agent_id"),
            pl.when(pl.lit(target_mouse_id) == -1)
            .then(pl.lit("self"))
            .otherwise(pl.lit("mouse") + pl.lit(target_mouse_id).cast(pl.Utf8))
            .alias("target_id"),
            pl.col("prediction").alias("action"),
            pl.col("video_frame").alias("start_frame"),
            pl.col("stop_frame"),
        )
    )

    return group_submission

# ============================================================
# 1. Load metadata and build behavior table
# ============================================================
print("Loading test metadata...")
test_df = pl.read_csv(INPUT_DIR / "test.csv")

print("Building behavior table from behaviors_labeled...")
behavior_df = build_behavior_dataframe(test_df)

# Materialise the (key tuple, sub-frame) pairs, in first-appearance order, so
# they can be counted here and iterated later.
groups = [
    *behavior_df.group_by("lab_id", "video_id", "agent", "target", maintain_order=True)
]
print(f"Number of (lab, video, agent, target) groups: {len(groups)}")

# ============================================================
# 2. Pre compute features for all videos
# ============================================================
print("Generating self and pair features for all test videos...")

# One dict per test.csv row; the same dict is passed to the feature builders
# as metadata.
rows = test_df.rows(named=True)

for row in tqdm(rows, total=len(rows)):
    lab_id, video_id = row["lab_id"], row["video_id"]

    tracking = pl.read_parquet(TEST_TRACKING_DIR / f"{lab_id}/{video_id}.parquet")

    self_feat = make_self_features(metadata=row, tracking=tracking)
    pair_feat = make_pair_features(metadata=row, tracking=tracking)

    # Feature files are keyed by video id only.
    self_feat.write_parquet(SELF_FEATURE_DIR / f"{video_id}.parquet")
    pair_feat.write_parquet(PAIR_FEATURE_DIR / f"{video_id}.parquet")

    # Drop the per-video tables before the next iteration.
    del self_feat, pair_feat, tracking
    gc.collect()

# ============================================================
# 3. Inference by group and segment construction
# ============================================================
print("Running inference and building group submissions...")

# Non-empty per-group segment tables, in group order.
group_submissions = []

for (lab_id, video_id, agent, target), group in tqdm(groups, total=len(groups)):
    group_submission = predict_for_group(lab_id, video_id, agent, target, group)

    # predict_for_group returns None when it has nothing to predict.
    if group_submission is None or group_submission.height == 0:
        continue
    group_submissions.append(group_submission)

# ============================================================
# 4. Robustify and final clean up (always create submission_not)
# ============================================================
if not group_submissions:
    # No group submissions: build an empty frame with the right schema and do
    # NOT call robustify.
    print("No group submissions found, creating empty submission_not DataFrame.")
    submission_not = pl.DataFrame(
        schema={
            "video_id": pl.Int64,
            "agent_id": pl.Utf8,
            "target_id": pl.Utf8,
            "action": pl.Utf8,
            "start_frame": pl.Int64,
            "stop_frame": pl.Int64,
        }
    )
else:
    # Data available: concatenate, sort, then robustify.
    submission_not = pl.concat(group_submissions, how="vertical")
    submission_not = submission_not.sort(
        "video_id", "agent_id", "target_id", "action", "start_frame", "stop_frame"
    )

    print("Initial submission_not rows:", submission_not.height)

    print("Running robustify on submission_not...")
    submission_not = robustify(submission_not, test_df, train_test="test")

    # Keep only valid intervals (start strictly before stop).
    submission_not = submission_not.filter(
        pl.col("start_frame") < pl.col("stop_frame")
    )

    # Drop ultra short segments (fewer than 2 frames), likely noise.
    with_duration = submission_not.with_columns(
        (pl.col("stop_frame") - pl.col("start_frame")).alias("duration")
    )
    submission_not = with_duration.filter(pl.col("duration") >= 2).drop("duration")

    print(
        "Rows after robustify, validity check and duration filter:",
        submission_not.height,
    )

# Add row_id and save.
final_path = WORKING_DIR / "submission_not.csv"
final_submission = submission_not.with_row_index("row_id")
final_submission.write_csv(final_path)

print("Saved submission_not to:", final_path)

Loading test metadata...
Building behavior table from behaviors_labeled...
Number of (lab, video, agent, target) groups: 16
Generating self and pair features for all test videos...


  0%|          | 0/1 [00:00<?, ?it/s]

Running inference and building group submissions...


  0%|          | 0/16 [00:00<?, ?it/s]

No group submissions found, creating empty submission_not DataFrame.
Saved submission_not to: /kaggle/working/submission_not.csv


In [20]:
from pathlib import Path
import shutil
import gc

WORKING_DIR = Path("/kaggle/working")

# 1) Remove everything in /kaggle/working except .csv files.
for path in WORKING_DIR.iterdir():
    if path.is_file():
        # Keep .csv files.
        if path.suffix == ".csv":
            continue
        try:
            path.unlink()
        except OSError as e:
            print(f"Cannot remove file {path}: {e}")
    elif path.is_dir():
        # ignore_errors=True keeps the removal best-effort (it continues past
        # entries it cannot delete) but also means rmtree never raises, so a
        # failure has to be detected by checking the path afterwards.
        shutil.rmtree(path, ignore_errors=True)
        if path.exists():
            print(f"Cannot remove dir {path}: directory still exists after rmtree")


gc.collect()

38

In [21]:
from pathlib import Path
import pandas as pd
import numpy as np

WORKING_DIR = Path("/kaggle/working")

# Partial submissions to merge: submission1..submission9 first, then
# submission_not (same read order as the explicit list this replaces).
sub_files = [WORKING_DIR / f"submission{i}.csv" for i in range(1, 10)]
sub_files.append(WORKING_DIR / "submission_not.csv")

dfs = []

for f in sub_files:
    if not f.exists():
        continue
    df = pd.read_csv(f)
    if "row_id" in df.columns:
        # Each partial file has its own row_id; it is regenerated after merging.
        df = df.drop(columns=["row_id"])
    if df.empty:
        # Header-only files add no rows. Passing empty frames to pd.concat is
        # deprecated and would affect the merged dtypes, so leave them out.
        continue
    dfs.append(df)

if dfs:
    merged = pd.concat(dfs, ignore_index=True)
    merged = merged.sort_values(
        ["video_id", "agent_id", "target_id", "action", "start_frame"],
        ignore_index=True,
    )
    merged.insert(0, "row_id", np.arange(len(merged), dtype=np.int64))
else:
    # Nothing to merge: still write a CSV with the full header.
    merged = pd.DataFrame(
        columns=[
            "row_id",
            "video_id",
            "agent_id",
            "target_id",
            "action",
            "start_frame",
            "stop_frame",
        ]
    )

out_path = WORKING_DIR / "submission.csv"
merged.to_csv(out_path, index=False)
print("Saved merged submission to:", out_path)


Saved merged submission to: /kaggle/working/submission.csv


# final