# Stage 1: One Hunter vs Static Targets
## Libraries

In [261]:
import os, sys, csv, cv2, imageio
import pandas as pd
import numpy as np
np.bool = bool
np.long = int
np.int = int

import torch, torch.nn as nn, torch.optim as optim
torch.set_num_threads(1)

import matplotlib.pyplot as plt
from IPython.display import HTML  # (for display_side_by_side)
from collections import deque
from tqdm.auto import tqdm
import shutil
import math
import copy
from math import inf
from dataclasses import dataclass
import inspect
import random
import pickle
sys.path.append("C:/Users/aveexela/Desktop/rl_project")

from world.realm import Realm
from world.map_loaders.base import MixedMapLoader
from world.map_loaders.single_team import (
    SingleTeamLabyrinthMapLoader,
    SingleTeamRocksMapLoader,
)
from world.map_loaders.pregenerated_loader import PregeneratedMapLoader
from world.envs import OnePlayerEnv
from world.utils import RenderedEnvWrapper
from world.scripted_agents import ClosestTargetAgent

from world.envs import VersusBotEnv
from world.realm import Realm
from world.map_loaders.base import MixedMapLoader
from world.map_loaders.pregenerated_loader import PregeneratedMapLoader
from world.map_loaders.two_teams import (
    TwoTeamRocksMapLoader, TwoTeamLabyrinthMapLoader
)
from world.map_loaders.single_team import (
    SingleTeamRocksMapLoader, SingleTeamLabyrinthMapLoader
)
from world.scripted_agents import ClosestTargetAgent

## Settings

In [262]:
# --- Global experiment settings -------------------------------------------
TEAM_SIZE = 5      # five hunters per team
MAP_SIZE = 40      # was 20
PREYS = 100
STEPS = 300        # episode step limit
LR = 1e-4          # single learning rate, no duplicate definitions
LOG_DIR   = os.environ.get("LOG_DIR", "logs")
FRAME_DIR = os.path.join(LOG_DIR, "frames")
MAP_DIR   = os.path.join(LOG_DIR, "maps")
LOG_STEP_PATH = os.path.join(LOG_DIR, "step_log.csv")

for d in (LOG_DIR, FRAME_DIR, MAP_DIR, os.path.join(LOG_DIR, "checkpoints"),
          os.path.join(LOG_DIR, "checkpoints_pvp"),
          os.path.join(LOG_DIR, "checkpoints_pvp_eval")):
    os.makedirs(d, exist_ok=True)

# Remove the image folders if they exist and recreate them empty.
for d in (FRAME_DIR, MAP_DIR):
    if os.path.isdir(d):
        shutil.rmtree(d)
os.makedirs(FRAME_DIR, exist_ok=True)
os.makedirs(MAP_DIR,   exist_ok=True)

# Start the per-step CSV log from scratch: drop the old file, write the header.
if os.path.exists(LOG_STEP_PATH):
    os.remove(LOG_STEP_PATH)
    print(f"Старый лог удалён: {LOG_STEP_PATH}")

with open(LOG_STEP_PATH, "w", newline="") as f:
    csv.writer(f).writerow([
        "phase", "episode", "step",
        "pred_id", "pred_y", "pred_x",
        "exec_action", "teacher_action",
        "alive_preys", "caught_total", "new_caught",
        "team_score", "score_delta",
        "reward", "r_base", "r_capture", "r_explore", "r_standstill", "r_revisit", "r_bfs", "r_repulse",
        "r_flipflop", "r_same_dir_close",           # <- new
        "idle_preds", "pair_d0", "pair_d1", "team_disp",
        "nearest_prey_y", "nearest_prey_x", "dy_to_nearest", "dx_to_nearest",
        "caught_coords"
    ])


# For eval we use a fixed visit-count matrix sized for the current MAP_SIZE.
visited_map = np.zeros((MAP_SIZE, MAP_SIZE), dtype=np.int32)
STEP_LOG_BUFFER = []
# Cache of movement vectors from the previous step: {pred_id: (dy, dx)}
LAST_DIRS = {}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Feature settings for the network agent
PATCH_SIZE = 7
K_NEAREST  = 4

# === REWARD COEFFS ===
exploration_coef     = 0.002   # was 0.01 - inflated aimless wandering too much
stand_still_penalty  = 0.004   # was 0.005
revisit_penalty      = 0.0007  # was 0.001

# Potential-based shaping is meant to be the main signal:
shaping_coef         = 0.02    # kept as is - now "safe" (difference of potentials)

# Early exploration boost - softened:
early_steps_boost    = 20
early_explore_scale  = 1.4

# Repulsion strongly softened + capped:
repulse_same_cell    = 0.02    # was 0.05
repulse_adjacent     = 0.01    # was 0.02
repulse_radius2      = 0.005   # was 0.01
repulse_boost        = 1.5     # was 2.0
REPULSE_CAP_PER_AGENT = 0.05   # new: cap on total repulsion per agent per step
REPULSE_KICKS_IN_AFTER = 40    # new: enable repulsion only after N episode steps

# Anti-"sawing" and anti-"lockstep" penalties - only when there is no progress towards a prey:
flipflop_penalty        = 0.01   # was 0.03
same_dir_close_penalty  = 0.007  # was 0.015

# Bonus tile - kept very small so it does not hurt the chase:
bonus_reward_base = 0.3          # was 2.0 - too generous
BONUS_TAKEN = set()

Device: cpu


## Agent Model

In [263]:
class FeatureBuilder:
    """
    Build a fixed-length feature vector for ONE hunter on a map of ANY size.

    Layout: [head (5)] + [P x P patch] + [k_nearest * 3] + [k_mates * 3].
      - preys: the k_nearest closest alive preys, each as [dy, dx, dn]
      - mates: the k_mates closest teammates (excluding self), each as [dy, dx, dn]
    dy/dx are normalised by H/W, dn by (H + W).

    Note: prey offsets are unsigned wrap-around (torus) gaps and prey ``dn`` is a
    toroidal BFS path length, while teammate offsets are signed, non-wrapped
    differences and teammate ``dn`` is a plain Manhattan distance.
    """
    def __init__(self, patch_size=PATCH_SIZE, k_nearest=K_NEAREST, k_mates: int = 2):
        self.patch_size = patch_size
        self.k_nearest  = k_nearest
        self.k_mates    = k_mates

    def __call__(self, info: dict, world_map, pred_idx: int = 0) -> np.ndarray:
        """
        Return a float32 array of shape (1, D) describing hunter ``pred_idx``.

        ``info`` must provide ``info["predators"]`` and ``info["preys"]`` as lists
        of dicts with "y", "x" and "alive" entries; ``world_map`` is anything
        ``np.array`` can turn into a 2-D or 3-D grid where -1 marks walls.
        """
        wm = np.array(world_map)
        H, W = wm.shape[:2]

        # Passability mask: in the 3-D layout a wall has -1 in both channel 0 and 1.
        if wm.ndim == 3:
            walls = (wm[:, :, 0] == -1) & (wm[:, :, 1] == -1)
        else:
            walls = (wm == -1)
        passable = (~walls).astype(np.float32)  # [H, W]

        # --- this hunter ---
        p = info["predators"][pred_idx]
        y, x = int(p["y"]), int(p["x"])

        # --- alive preys ---
        preys = [q for q in info["preys"] if q["alive"]]
        alive_cnt = len(preys)

        # --- local patch ---
        patch = extract_toroidal_patch(passable, y, x, self.patch_size).reshape(1, -1)

        # --- K closest preys (by toroidal BFS distance) ---
        prey_feats = []
        if preys:
            dgrid = shortest_path_grid_from_torus(wm, (y, x))
            dlist = [(float(dgrid[int(q["y"]) % H, int(q["x"]) % W]), q) for q in preys]
            dlist.sort(key=lambda t: t[0])
            for d, q in dlist[:self.k_nearest]:
                if not np.isfinite(d):
                    # Unreachable prey: encode exactly like padding.
                    dy = dx = 0.0
                    dn = 1.0
                else:
                    qy, qx = int(q["y"]) % H, int(q["x"]) % W
                    # Torus normalisation: shorter of the two wrap-around gaps.
                    dy = (qy - y) % H
                    dy = min(dy, H - dy) / max(1, H)
                    dx = (qx - x) % W
                    dx = min(dx, W - dx) / max(1, W)
                    dn = float(min(d, H + W) / (H + W))
                prey_feats.extend([dy, dx, dn])

        # Pad up to k_nearest entries.
        need = self.k_nearest * 3
        while len(prey_feats) < need:
            prey_feats.extend([0.0, 0.0, 1.0])

        # --- K closest teammates (excluding self) ---
        mates = [q for i, q in enumerate(info["predators"]) if i != pred_idx and q.get("alive", True)]
        mate_feats = []
        if mates:
            # Plain (non-wrapped) Manhattan distance.
            dlist = []
            for q in mates:
                qy, qx = int(q["y"]), int(q["x"])
                d = abs(qy - y) + abs(qx - x)
                dlist.append((d, qy, qx))
            dlist.sort(key=lambda t: t[0])
            for d, qy, qx in dlist[:self.k_mates]:
                dy = (qy - y) / max(1, H)
                dx = (qx - x) / max(1, W)
                dn = float(min(d, H + W) / (H + W))
                mate_feats.extend([dy, dx, dn])

        # Pad up to k_mates entries.
        need_m = self.k_mates * 3
        while len(mate_feats) < need_m:
            mate_feats.extend([0.0, 0.0, 1.0])

        # --- head: position, alive flag, prey density, distance to nearest prey ---
        alive_pred = float(p["alive"])
        nearest_dn = prey_feats[2] if len(prey_feats) >= 3 else 1.0
        head = np.array([[y / max(1, H),
                          x / max(1, W),
                          alive_pred,
                          alive_cnt / max(1, H * W),
                          nearest_dn]], dtype=np.float32)

        feats = np.concatenate(
            [head, patch, np.array(prey_feats, dtype=np.float32).reshape(1, -1),
             np.array(mate_feats, dtype=np.float32).reshape(1, -1)],
            axis=1
        )
        return feats

class FeatureBuilderPvP(FeatureBuilder):
    """
    Extension of FeatureBuilder that appends the K closest enemies as
    [dy, dx, dn] triples with torus normalisation.
    """
    def __init__(self, patch_size=PATCH_SIZE, k_nearest=K_NEAREST, k_mates=2, k_enemies=2):
        super().__init__(patch_size, k_nearest, k_mates)
        self.k_enemies = k_enemies

    def __call__(self, info: dict, world_map, pred_idx: int = 0) -> np.ndarray:
        """
        Return the base feature row with ``k_enemies * 3`` enemy features appended.

        Enemies are read from ``info["enemies"]`` (same dict format as
        ``info["predators"]``); a missing key is treated as "no enemies".
        """
        base = super().__call__(info, world_map, pred_idx)  # already head+patch+preys+mates
        wm = np.array(world_map)
        H, W = wm.shape[:2]

        enemies = [q for q in info.get("enemies", []) if q.get("alive", True)]
        p = info["predators"][pred_idx]
        y, x = int(p["y"]), int(p["x"])

        enemy_feats = []
        if enemies:
            # Rank enemies by toroidal Manhattan distance.
            dlist = []
            for q in enemies:
                qy, qx = int(q["y"]), int(q["x"])
                d = manhattan_torus(y, x, qy, qx, H, W)
                dlist.append((d, qy, qx))
            dlist.sort(key=lambda t: t[0])
            for d, qy, qx in dlist[:self.k_enemies]:
                dy = (qy - y) % H
                dy = min(dy, H - dy) / max(1, H)
                dx = (qx - x) % W
                dx = min(dx, W - dx) / max(1, W)
                dn = float(min(d, H + W) / (H + W))
                enemy_feats.extend([dy, dx, dn])

        # Pad up to k_enemies entries.
        need_e = self.k_enemies * 3
        while len(enemy_feats) < need_e:
            enemy_feats.extend([0.0, 0.0, 1.0])

        return np.concatenate([base, np.array(enemy_feats, dtype=np.float32).reshape(1, -1)], axis=1)


class PolicyNet(nn.Module):
    """
    Two-hidden-layer MLP with a policy head and a value head.

    The policy head is the last layer of ``self.net``; the value head is a
    separate linear layer fed from the same hidden activation.
    """
    def __init__(self, input_dim: int, hidden=128, num_actions=5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden),    nn.ReLU(),
            nn.Linear(hidden, num_actions)
        )
        self.value_head = nn.Linear(hidden, 1)

    def forward(self, x):
        """Return ``(logits, value)`` with trailing dims ``num_actions`` and 1."""
        h = self.net[:-1](x)       # everything up to (not including) the last Linear
        logits = self.net[-1](h)   # policy head
        value  = self.value_head(h)
        return logits, value

class NetAgentShared:
    """
    Shared MLP for N hunters: one set of weights for the whole team.

    Features of all predators are batched together and a list of actions is
    returned. A small behaviour-cloning replay buffer is included; the original
    author added it for stability at low beta.
    """
    def __init__(self, feature_builder: FeatureBuilder, num_actions=5, lr=LR, device=None, team_size=5):
        self.f = feature_builder
        self.team_size = team_size
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        # Remember the constructor arguments so the lazily built model honours them.
        self.num_actions = num_actions
        self.lr = lr
        self._input_dim = None
        self.model = None
        self.optimizer = None
        self.ce = nn.CrossEntropyLoss(label_smoothing=0.05)

        # --- simple replay ---
        self.rb_feats  = deque(maxlen=50000)  # one feature vector [D] per entry
        self.rb_labels = deque(maxlen=50000)  # int action labels
        self._rng = np.random.default_rng(0)

    def _ensure_model(self, sample_feats: np.ndarray, num_actions=None):
        """Lazily build the model/optimizer once the feature width is known."""
        if self.model is not None:
            return
        if num_actions is None:
            num_actions = self.num_actions
        D = sample_feats.shape[1]
        self._input_dim = D
        self.model = PolicyNet(D, hidden=128, num_actions=num_actions).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

    def _feats_all_preds(self, info: dict, world_map) -> np.ndarray:
        """Stack the feature rows of all ``team_size`` predators into [K, D]."""
        feats_list = [self.f(info, world_map, pred_idx=k) for k in range(self.team_size)]
        return np.concatenate(feats_list, axis=0)  # [K, D]

    def get_actions(self, info: dict, world_map, training=False, greedy=False):
        """
        Choose one action per predator.

        Returns ``(actions, logprobs, vals, ents)``. In greedy/eval mode the
        actions are the argmax of the masked logits and ``logprobs``/``ents``
        are zero placeholders; in training mode actions are sampled from the
        masked categorical distribution.
        """
        feats = self._feats_all_preds(info, world_map)   # [K, D]
        self._ensure_model(feats[:1, :])
        x = torch.tensor(feats, dtype=torch.float32, device=self.device)
        logits, values = self.model(x)  # [K, A], [K, 1]

        # --- torus mask of available actions (moves wrap around map edges) ---
        # actions: 0=stay, 1=right (x+1), 2=left (x-1), 3=up (y-1), 4=down (y+1)
        wm = np.array(world_map)
        if wm.ndim == 3:
            walls = (wm[:, :, 0] == -1) & (wm[:, :, 1] == -1)
        else:
            walls = (wm == -1)
        H, W = walls.shape
        preds = info.get("predators", [])

        def passable(cy, cx):
            return not walls[cy % H, cx % W]

        # mask: True = allowed, False = blocked by a wall
        mask = torch.ones_like(logits, dtype=torch.bool)  # [K, 5]
        for k, pr in enumerate(preds):
            y, x_ = int(pr["y"]), int(pr["x"])
            if not passable(y, x_ + 1):
                mask[k, 1] = False
            if not passable(y, x_ - 1):
                mask[k, 2] = False
            if not passable(y - 1, x_):
                mask[k, 3] = False
            if not passable(y + 1, x_):
                mask[k, 4] = False

        # If every move is blocked, make sure "stay" remains available.
        all_forbidden = (~mask[:, 1:]).all(dim=1)
        mask[all_forbidden, 0] = True

        masked_logits = logits.masked_fill(~mask, -1e9)

        if greedy or not training:
            a = torch.argmax(masked_logits, dim=-1)  # [K]
            actions = [int(i) for i in a.detach().cpu().tolist()]
            # placeholders so both branches return the same structure
            logprobs = [torch.tensor(0.0, device=self.device) for _ in range(self.team_size)]
            ents     = [torch.tensor(0.0, device=self.device) for _ in range(self.team_size)]
            vals     = [v.squeeze(-1) for v in values]  # [K, 1] -> list of scalar tensors
            return actions, logprobs, vals, ents
        else:
            probs = torch.softmax(masked_logits, dim=-1)
            probs = probs / probs.sum(dim=-1, keepdim=True).clamp_min(1e-8)
            dist  = torch.distributions.Categorical(probs)
            at    = dist.sample()
            actions = [int(i) for i in at.detach().cpu().tolist()]
            logprobs = dist.log_prob(at).unbind()
            ents     = dist.entropy().unbind()
            vals     = [v.squeeze(-1) for v in values]
            return actions, logprobs, vals, ents

    def train_step_bc_multi(self, info: dict, world_map, teacher_actions: list[int]):
        """
        One behaviour-cloning gradient step on the current team state.

        Returns ``(loss, feats, labels)`` so the caller can push the sample
        into the replay buffer.
        """
        feats = self._feats_all_preds(info, world_map)   # [K, D]
        self._ensure_model(feats[:1, :])
        x = torch.tensor(feats, dtype=torch.float32, device=self.device)
        y = torch.tensor([int(a) for a in teacher_actions], dtype=torch.long, device=self.device)  # [K]
        logits, _ = self.model(x)
        loss = self.ce(logits, y)
        self.optimizer.zero_grad(set_to_none=True)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
        self.optimizer.step()
        return float(loss.detach().cpu()), feats, [int(a) for a in teacher_actions]

    # --- replay ---
    def add_replay(self, feats: np.ndarray, labels: list[int]):
        """Append (feature row, label) pairs to the replay buffer, one per row."""
        for i in range(min(len(feats), len(labels))):
            self.rb_feats.append(feats[i].astype(np.float32, copy=True))
            self.rb_labels.append(int(labels[i]))

    def replay_step(self, steps: int = 1, batch: int = 128):
        """
        Run ``steps`` behaviour-cloning updates on random replay minibatches.

        Does nothing (returns 0.0) until the buffer holds at least 512 samples
        and the model exists. Returns the mean loss over the performed steps.
        """
        if len(self.rb_feats) < 512 or self.model is None:
            return 0.0
        losses = []
        for _ in range(steps):
            idx = self._rng.integers(0, len(self.rb_feats), size=min(batch, len(self.rb_feats)))
            x = torch.tensor(np.stack([self.rb_feats[i] for i in idx], axis=0),
                             dtype=torch.float32, device=self.device)
            y = torch.tensor([self.rb_labels[i] for i in idx], dtype=torch.long, device=self.device)
            logits, _ = self.model(x)
            loss = self.ce(logits, y)
            self.optimizer.zero_grad(set_to_none=True)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.optimizer.step()
            losses.append(float(loss.detach().cpu()))
        return float(np.mean(losses)) if losses else 0.0

class AssignedClosestTargetAgent(ClosestTargetAgent):
    """
    Assign hunters unique targets (greedy matching) with hysteresis:
    - keep the current assignment for at least ``hold_steps`` steps,
    - after that, switch only if the new target is shorter by more than
      ``switch_margin``.
    The original author added this to reduce target thrashing and crowding.

    Coordinates follow the base agent's convention: ``state[x, y, *]`` with
    x = row and y = column; cells are converted to linear indices ``x * W + y``.
    """
    def __init__(self, num_predators: int, hold_steps: int = 4, switch_margin: float = 2.0):
        super().__init__(num_predators=num_predators)
        self.hold_steps = int(hold_steps)
        self.switch_margin = float(switch_margin)
        self._assign_tlin = {}   # pred_idx -> t_lin (linear index of the target)
        self._assign_age  = {}   # pred_idx -> number of steps this target has been held

    def reset(self, state, team):
        """Reset the base agent and forget all held assignments."""
        # The base reset is what prepares distance_map / action_map
        # (presumably - verify against ClosestTargetAgent).
        super().reset(state, team)
        self._assign_tlin.clear()
        self._assign_age.clear()

    def get_actions(self, state, team):
        """Return one action per predator; 0 (stay) when no reachable target exists."""
        actions = [0 for _ in range(self.num_predators)]
        predators = {}  # idx -> (x, y)
        preys = []

        preys_team = np.max(state[:, :, 0])
        if preys_team == team:
            preys_team = None

        # Scan the grid as the base agent does (x = row, y = column).
        for x in range(state.shape[0]):
            for y in range(state.shape[1]):
                if state[x, y, 0] == team:
                    predators[state[x, y, 1]] = (x, y)
                    continue
                if (preys_team is None and state[x, y, 0] > 0) or (state[x, y, 0] == preys_team):
                    preys.append((x, y))

        if not predators or not preys:
            return actions

        H, W = state.shape[0], state.shape[1]
        alive_tlin = {px * W + py for (px, py) in preys}

        # Collect every (predator, prey) pair with a finite distance.
        pairs = []
        for pred_idx, (px, py) in predators.items():
            p_lin = px * W + py
            for (tx, ty) in preys:
                t_lin = tx * W + ty
                d = float(self.distance_map[p_lin, t_lin])
                if np.isfinite(d):
                    pairs.append((pred_idx, p_lin, (tx, ty), t_lin, d))

        if not pairs:
            return actions

        # Greedy unique assignment (no collisions), shortest pairs first.
        pairs.sort(key=lambda z: z[4])
        used_pred, used_tlin = set(), set()
        proposed = {}  # pred_idx -> (p_lin, t_lin, d_new)
        for pred_idx, p_lin, (tx, ty), t_lin, d in pairs:
            if pred_idx in used_pred or t_lin in used_tlin:
                continue
            used_pred.add(pred_idx)
            used_tlin.add(t_lin)
            proposed[pred_idx] = (p_lin, t_lin, d)
            if len(proposed) == self.num_predators:
                break

        # Hysteresis: decide whether to keep the old target or accept the new one.
        chosen = {}  # pred_idx -> (p_lin, t_lin_final)
        for pred_idx, (px, py) in predators.items():
            p_lin = px * W + py

            # New recommendation from the matching (may be absent).
            prop = proposed.get(pred_idx, None)
            new_t_lin = prop[1] if prop is not None else None
            new_d     = prop[2] if prop is not None else np.inf

            # Previous assignment (only if it exists and that target is still present).
            old_t_lin = self._assign_tlin.get(pred_idx, None)
            have_old  = (old_t_lin is not None) and (old_t_lin in alive_tlin)
            old_d     = float(self.distance_map[p_lin, old_t_lin]) if have_old else np.inf
            age       = self._assign_age.get(pred_idx, 0)

            keep_old = False
            if have_old:
                if age < self.hold_steps:
                    # Still inside the mandatory hold window.
                    keep_old = True
                else:
                    # Switching is allowed, but only if the new target is
                    # shorter by more than switch_margin.
                    if not (np.isfinite(new_d) and (new_d + self.switch_margin < old_d)):
                        keep_old = True

            if keep_old:
                t_lin_final = old_t_lin
                self._assign_age[pred_idx] = age + 1
            else:
                if new_t_lin is not None and np.isfinite(new_d):
                    t_lin_final = new_t_lin
                else:
                    # Fallback: the overall nearest prey, even if already taken.
                    bestd, best_t = np.inf, None
                    for (tx, ty) in preys:
                        t_lin = tx * W + ty
                        d = float(self.distance_map[p_lin, t_lin])
                        if d < bestd:
                            bestd, best_t = d, t_lin
                    t_lin_final = best_t

                self._assign_tlin[pred_idx] = t_lin_final
                self._assign_age[pred_idx]  = 0

            chosen[pred_idx] = (p_lin, t_lin_final)

        # First action along the path to the chosen target, from action_map.
        for k, (px, py) in predators.items():
            p_lin, t_lin = chosen.get(k, (px * W + py, None))
            if t_lin is None or not np.isfinite(self.distance_map[p_lin, t_lin]):
                actions[k] = 0  # no usable target: stay put
            else:
                actions[k] = int(self.action_map[p_lin, t_lin])
        return actions
        
# === LIGHTWEIGHT MODEL AND AGENT (for time-constrained runs) ===
class PolicyNetLite(nn.Module):
    """
    Lightweight policy: a single, smaller hidden layer plus a value head
    (kept for interface compatibility with the heavier network).

    The input size matches the heavy model; per the original author this allows
    distillation without changing the features.
    """
    def __init__(self, input_dim: int, hidden=64, num_actions=5):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden)
        self.act = nn.ReLU()
        self.logits = nn.Linear(hidden, num_actions)
        self.value_head = nn.Linear(hidden, 1)

    def forward(self, x):
        """Return ``(logits, value)`` computed from one shared hidden layer."""
        h = self.act(self.fc1(x))
        return self.logits(h), self.value_head(h)


class NetAgentSharedLite(NetAgentShared):
    """
    Same interface as NetAgentShared, but built on PolicyNetLite with a smaller
    replay buffer and lighter label smoothing.
    """
    def __init__(self, feature_builder: FeatureBuilder, num_actions=5, lr=LR, device=None, team_size=5):
        super().__init__(feature_builder, num_actions, lr, device, team_size)
        # Remember the constructor arguments so the lazily built model honours them.
        self.num_actions = num_actions
        self.lr = lr
        # Smaller replay and milder cross-entropy smoothing than the base agent.
        self.rb_feats  = deque(maxlen=20000)
        self.rb_labels = deque(maxlen=20000)
        self.ce = nn.CrossEntropyLoss(label_smoothing=0.03)

    def _ensure_model(self, sample_feats: np.ndarray, num_actions=None):
        """Lazily build the lite model/optimizer once the feature width is known."""
        if self.model is not None:
            return
        if num_actions is None:
            num_actions = self.num_actions
        D = sample_feats.shape[1]
        self._input_dim = D
        self.model = PolicyNetLite(D, hidden=64, num_actions=num_actions).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

## Functions

In [264]:
def _is_wall(raw_map, y, x):
    """
    Tell whether cell [y, x] of ``raw_map`` is a wall.

    2-D maps mark a wall with -1; 3-D maps mark it with -1 in both channel 0
    and channel 1.
    """
    if raw_map.ndim == 3:
        return (raw_map[y, x, 0] == -1) and (raw_map[y, x, 1] == -1)
    else:
        return (raw_map[y, x] == -1)

def manhattan_torus(y1, x1, y2, x2, H, W):
    """Manhattan distance between (y1, x1) and (y2, x2) on an H x W torus."""
    def axis_gap(a, b, period):
        # Shorter of the two ways around the ring along one axis.
        forward = (a - b) % period
        backward = (b - a) % period
        return forward if forward <= backward else backward

    return axis_gap(y1, y2, H) + axis_gap(x1, x2, W)

from collections import deque

def shortest_path_grid_from_torus(raw_map, start_yx):
    """
    BFS with toroidal wrapping (4-neighbourhood) that respects walls.

    Parameters
    ----------
    raw_map : array-like, 2-D or 3-D
        Grid where -1 marks a wall (in 3-D: -1 in both channel 0 and 1).
    start_yx : (int, int)
        Start cell as (y, x); both are wrapped modulo the map size.

    Returns
    -------
    np.ndarray of float32, shape (H, W)
        ``dist[y, x]`` is the path length from the start, or ``np.inf`` when the
        cell is unreachable. All ``np.inf`` if the start itself is a wall.
    """
    grid = np.asarray(raw_map)
    H, W = grid.shape[:2]
    # Compute the wall mask once instead of re-testing cells inside the BFS loop.
    if grid.ndim == 3:
        walls = (grid[:, :, 0] == -1) & (grid[:, :, 1] == -1)
    else:
        walls = (grid == -1)

    dist = np.full((H, W), np.inf, dtype=np.float32)
    sy, sx = start_yx
    sy %= H
    sx %= W
    if walls[sy, sx]:
        return dist

    dist[sy, sx] = 0.0
    dq = deque([(sy, sx)])
    while dq:
        y, x = dq.popleft()
        d = dist[y, x] + 1.0
        for ny, nx in ((y - 1, x), (y + 1, x), (y, x - 1), (y, x + 1)):
            ny %= H
            nx %= W
            if dist[ny, nx] > d and not walls[ny, nx]:
                dist[ny, nx] = d
                dq.append((ny, nx))
    return dist

def shortest_path_grid_from(raw_map, start_yx):
    """
    BFS distance grid from ``start_yx`` on a bounded map (no wrap-around).

    Parameters
    ----------
    raw_map : array-like, 2-D or 3-D
        Grid where -1 marks a wall (in 3-D: -1 in both channel 0 and 1).
    start_yx : (int, int)
        Start cell as (y, x).

    Returns
    -------
    np.ndarray of float32, shape (H, W)
        ``dist[y, x]`` is the 4-neighbourhood path length from the start, or
        ``np.inf`` when unreachable. All ``np.inf`` if the start is a wall.
    """
    grid = np.asarray(raw_map)
    H, W = grid.shape[:2]
    # Compute the wall mask once instead of re-testing cells inside the BFS loop.
    if grid.ndim == 3:
        walls = (grid[:, :, 0] == -1) & (grid[:, :, 1] == -1)
    else:
        walls = (grid == -1)

    sy, sx = start_yx
    dist = np.full((H, W), np.inf, dtype=np.float32)
    if walls[sy, sx]:
        return dist

    dist[sy, sx] = 0.0
    dq = deque([(sy, sx)])
    while dq:
        y, x = dq.popleft()
        d = dist[y, x] + 1.0
        # Neighbour order: up, down, left, right (the order does not matter).
        for ny, nx in ((y - 1, x), (y + 1, x), (y, x - 1), (y, x + 1)):
            if 0 <= ny < H and 0 <= nx < W and dist[ny, nx] > d and not walls[ny, nx]:
                dist[ny, nx] = d
                dq.append((ny, nx))
    return dist


def extract_toroidal_patch(passable_map: np.ndarray, y: int, x: int, size: int) -> np.ndarray:
    """
    Cut a size x size float32 window centred on [y, x], wrapping around the
    map edges (torus). Coordinates are [y, x] throughout.
    """
    assert size % 2 == 1, "PATCH_SIZE должен быть нечетным"
    n_rows, n_cols = passable_map.shape
    half = size // 2
    # Wrapped row/column indices of the window, gathered in one indexing step.
    row_idx = np.arange(y - half, y + half + 1) % n_rows
    col_idx = np.arange(x - half, x + half + 1) % n_cols
    return passable_map[np.ix_(row_idx, col_idx)].astype(np.float32)


def shortest_path_dist(raw_map, src_yx, dst_yx):
    """Length of the shortest 4-neighbour path avoiding walls (-1); np.inf if there is none."""
    start_y, start_x = src_yx
    goal_y, goal_x = dst_yx
    if start_y == goal_y and start_x == goal_x:
        return 0

    n_rows, n_cols = raw_map.shape[:2]
    visited = np.zeros((n_rows, n_cols), dtype=bool)
    visited[start_y, start_x] = True

    frontier = deque()
    frontier.append((start_y, start_x, 0))
    offsets = ((-1, 0), (1, 0), (0, -1), (0, 1))  # up, down, left, right

    while frontier:
        cy, cx, steps = frontier.popleft()
        for oy, ox in offsets:
            ny, nx = cy + oy, cx + ox
            # Skip cells outside the (non-wrapping) map.
            if not (0 <= ny < n_rows and 0 <= nx < n_cols):
                continue
            # Skip cells already queued and cells that are walls.
            if visited[ny, nx] or _is_wall(raw_map, ny, nx):
                continue
            if ny == goal_y and nx == goal_x:
                return steps + 1
            visited[ny, nx] = True
            frontier.append((ny, nx, steps + 1))
    return np.inf



## Mixed Maps 

In [265]:
def build_singleteam_mixed_loader(
    size=40, preys_num=100, spawn_points=5,
    rocks_grid=None,
    lab_links=None,
    pregenerated_dir=None,
):
    """
    Build a MixedMapLoader from several generators:
      - Rocks: one loader per (rock_spawn_proba, additional_rock_spawn_proba) pair
        in ``rocks_grid`` (8 configurations by default)
      - Labyrinth: one loader per (additional_links_max, additional_links_min) pair
        in ``lab_links`` (8 configurations by default)
      - (optional) pregenerated maps from ``pregenerated_dir``, added only when
        the folder exists and is not empty
    """
    loaders = []

    # Default rock maps: probabilities 0.01..0.15, additional 0..0.21.
    if rocks_grid is None:
        rocks_grid = [
            (0.01, 0.00), (0.03, 0.05), (0.05, 0.10), (0.07, 0.12),
            (0.09, 0.15), (0.11, 0.18), (0.13, 0.20), (0.15, 0.21),
        ]
    for p, ap in rocks_grid:
        loaders.append(
            SingleTeamRocksMapLoader(
                size=size, preys_num=preys_num, spawn_points=spawn_points,
                rock_spawn_proba=p, additional_rock_spawn_proba=ap
            )
        )

    # Default labyrinth maps: from "loose" (many extra links) to "narrow".
    if lab_links is None:
        # (max, min)
        lab_links = [(24, 12), (20, 10), (16, 8), (12, 6),
                     (10, 4),  (8, 3),  (6, 2),  (3, 1)]
    for lmax, lmin in lab_links:
        loaders.append(
            SingleTeamLabyrinthMapLoader(
                size=size, preys_num=preys_num, spawn_points=spawn_points,
                additional_links_max=lmax, additional_links_min=lmin
            )
        )

    # Hand-crafted maps, if a folder with them is available (optional).
    if pregenerated_dir and PregeneratedMapLoader is not None:
        if os.path.isdir(pregenerated_dir) and len(os.listdir(pregenerated_dir)) > 0:
            loaders.append(PregeneratedMapLoader(pregenerated_dir))

    return MixedMapLoader(loaders)

## Visualization Utilities 

In [266]:
def make_color_gif(env_wrapper, save_path, resize_factor=10, fps=8):
    """
    Render the frames stored in ``env_wrapper.frames`` to a colour GIF.

    Frames are kept in numpy orientation (map[y, x]) with no transpose or
    flip, so the picture lines up with visited_map. Each cell is scaled up to
    ``resize_factor`` x ``resize_factor`` pixels.
    """
    frames = []
    for frame in env_wrapper.frames:
        h, w = frame.shape
        img = np.zeros((h, w, 3), dtype=np.float32)
        img[frame == -1] = env_wrapper.road_color
        img[frame == -2] = env_wrapper.stone_color
        img[frame == -3] = env_wrapper.bonus_color
        for j in range(env_wrapper.realm.world.playable_teams_num):
            img[frame == j] = env_wrapper.team_colors[j]
        img[frame == env_wrapper.realm.world.playable_teams_num] = env_wrapper.prey_color

        # Upscale by repeating pixels along both axes (no transpose, no flip).
        img = np.repeat(np.repeat(img, resize_factor, axis=0), resize_factor, axis=1)
        img = (img * 255).astype(np.uint8)
        # NOTE(review): imageio normally expects RGB frames; this conversion is
        # only right if the wrapper's colours are stored as BGR - confirm.
        frames.append(cv2.cvtColor(img, cv2.COLOR_RGB2BGR))

    imageio.mimsave(save_path, frames, duration=1/fps)
    print(f"GIF сохранён: {save_path}")


def visualize_visited_map(ep, info, visited_map, predator_path, save_path):
    """
    Save a visit heatmap for one hunter with walls, path and preys overlaid.

    Everything is drawn in numpy order (row ``y``, column ``x``) with the
    origin in the upper-left corner. The displayed area is the intersection
    of the ``visited_map`` size and the raw map size.

    Args:
        ep: Episode number, used only in the title.
        info: Environment info dict. Keys read here (all optional): ``"map"``,
            ``"preys"`` (dicts with ``"y"``, ``"x"``, ``"alive"``) and
            ``"predators"`` (dicts with ``"y"``, ``"x"``; only the first entry
            is drawn).
        visited_map: 2-D array of per-cell visit counts.
        predator_path: Sequence of ``(y, x)`` positions of the hunter.
        save_path: Output image path handed to ``fig.savefig``.

    When ``info`` carries no map, the function falls back to the module-level
    ``env_wrapper`` if one is defined.
    """
    # --- 0) Source sizes ---
    Hv, Wv = visited_map.shape

    raw_map = None
    if ("map" in info) and (info["map"] is not None):
        raw_map = np.array(info["map"])
    # Same guard as the reward functions: tolerate a missing global env_wrapper.
    elif "env_wrapper" in globals() and hasattr(env_wrapper.base_env.realm.world, "map"):
        raw_map = np.array(env_wrapper.base_env.realm.world.map)

    if raw_map is not None and raw_map.ndim >= 2:
        Hr, Wr = raw_map.shape[:2]
    else:
        Hr, Wr = Hv, Wv

    # Displayed area = intersection of both sizes.
    dispH, dispW = min(Hv, Hr), min(Wv, Wr)
    vmap = visited_map[:dispH, :dispW]

    # --- 1) Figure and layout (large heatmap + narrow colorbar) ---
    plt.style.use("dark_background")
    long_side = max(dispH, dispW)
    # Figure side in inches, adapted to the map size and clamped to [6.8, 9.0].
    scale = max(6.8, min(9.0, 6.2 * long_side / 20.0))
    fig = plt.figure(figsize=(scale, scale), facecolor="black")
    gs = fig.add_gridspec(nrows=1, ncols=2, width_ratios=[1.0, 0.045], wspace=0.035)

    ax = fig.add_subplot(gs[0, 0])
    cax = fig.add_subplot(gs[0, 1])

    # --- 2) Heatmap base layer ---
    im = ax.imshow(vmap, cmap="plasma", origin="upper", interpolation="nearest")
    cbar = fig.colorbar(im, cax=cax)
    cbar.set_label("Посещения клетки",fontsize = 14)
    cax.set_facecolor("black")
    cax.tick_params(colors="white", length=0)
    for spine in cax.spines.values():
        spine.set_color("black")

    # --- 3) Coordinate geometry ---
    ax.set_xlim(-0.5, dispW - 0.5)
    ax.set_ylim(dispH - 0.5, -0.5)  # inverted y axis to match origin='upper'
    ax.set_aspect("equal", adjustable="box")
    ax.autoscale(False)

    # --- 4) Walls ---
    if raw_map is not None:
        if raw_map.ndim == 3:
            # In a 3-D map a wall is a cell whose channels 0 and 1 are both -1.
            walls_full = np.logical_and(raw_map[:, :, 0] == -1, raw_map[:, :, 1] == -1)
        else:
            walls_full = (raw_map == -1)  # -1 = wall in a 2-D map
        walls = walls_full[:dispH, :dispW]
        ax.imshow(np.ma.masked_where(~walls, walls),
                  cmap=plt.cm.Greys_r, alpha=0.85, origin="upper", zorder=1)
        ax.contour(walls.astype(float), levels=[0.5],
                   colors="#00FFFF", linewidths=1.0, alpha=0.8, zorder=2)

    # --- 5) Hunter path (torus-aware, clipped to the displayed area) ---
    if predator_path and len(predator_path) > 1:
        path = [(int(y), int(x))
                for (y, x) in predator_path
                if 0 <= y < dispH and 0 <= x < dispW]
        if len(path) > 1:
            path = np.array(path, dtype=np.int32)
            ys, xs = path[:, 0], path[:, 1]
            seg_y, seg_x = [[ys[0]]], [[xs[0]]]
            for i in range(1, len(xs)):
                dx, dy = abs(xs[i] - xs[i - 1]), abs(ys[i] - ys[i - 1])
                # A jump longer than half the displayed size is treated as a
                # wrap-around: start a new segment instead of drawing across.
                if dx > dispW // 2 or dy > dispH // 2:
                    seg_y.append([ys[i]])
                    seg_x.append([xs[i]])
                else:
                    seg_y[-1].append(ys[i])
                    seg_x[-1].append(xs[i])
            for sy, sx in zip(seg_y, seg_x):
                if len(sx) > 1:
                    ax.plot(sx, sy, color="#FFFFFF", linewidth=1.7, alpha=0.9, zorder=5, clip_on=True)

    # --- 6) Preys ---
    alive, dead = [], []
    if "preys" in info:
        for p in info["preys"]:
            y, x = int(p.get("y", -1)), int(p.get("x", -1))
            if 0 <= y < dispH and 0 <= x < dispW:
                (alive if p.get("alive", False) else dead).append((y, x))

    if alive:
        ys, xs = zip(*alive)
        ax.scatter(xs, ys, c="#7CFC00", marker="o", s=75,
                   edgecolors="black", linewidths=0.5, zorder=6, clip_on=True)
    if dead:
        ys, xs = zip(*dead)
        ax.scatter(xs, ys, c="#FF6347", marker="X", s=95,
                   edgecolors="black", linewidths=0.6, zorder=6, clip_on=True)

    # --- 7) Final hunter position ---
    if "predators" in info and len(info["predators"]) > 0:
        pr = info["predators"][0]
        py, px = int(pr.get("y", -1)), int(pr.get("x", -1))
        if 0 <= py < dispH and 0 <= px < dispW:
            ax.scatter(px, py, c="#00FFFF", marker="D", s=135,
                       edgecolors="black", linewidths=0.6, zorder=7, clip_on=True)

    # --- 8) Styling ---
    ax.set_title(
        f"Эпизод {ep} — исследовано: {int(np.count_nonzero(vmap))} клеток",
        fontsize=15, color="#EEEEEE", pad=10
    )
    ax.grid(True, alpha=0.18, color="#444444", linestyle=":")
    ax.set_axis_off()

    # Compact margins and save.
    fig.subplots_adjust(left=0.02, right=0.985, top=0.93, bottom=0.02)
    fig.savefig(save_path, dpi=220, bbox_inches="tight", facecolor="black", pad_inches=0.02)
    plt.close(fig)
    print(f"Карта посещений сохранена: {save_path}")
    

def visualize_team_map(ep, info, visited_map_team, predator_paths, save_path):
    """
    Save the team visit heatmap together with the tracks of all hunters.

    Args:
        ep: Episode number, used only in the title.
        info: Environment info dict. Keys read here (all optional): ``"map"``
            and ``"preys"`` (dicts with ``"y"``, ``"x"``, ``"alive"``).
        visited_map_team: 2-D (H x W) array of visit counts, incremented on
            every step for EVERY hunter.
        predator_paths: List of K lists ``[(y, x), ...]``, one per hunter.
        save_path: Output image path handed to ``fig.savefig``.

    When ``info`` carries no map, the function falls back to the module-level
    ``env_wrapper`` if one is defined.
    """
    # 0) Sizes
    Hv, Wv = visited_map_team.shape

    raw_map = None
    if ("map" in info) and (info["map"] is not None):
        raw_map = np.array(info["map"])
    # Same guard as the reward functions: tolerate a missing global env_wrapper.
    elif "env_wrapper" in globals() and hasattr(env_wrapper.base_env.realm.world, "map"):
        raw_map = np.array(env_wrapper.base_env.realm.world.map)

    if raw_map is not None and raw_map.ndim >= 2:
        Hr, Wr = raw_map.shape[:2]
    else:
        Hr, Wr = Hv, Wv

    # Displayed area = intersection of the heatmap and raw-map sizes.
    dispH, dispW = min(Hv, Hr), min(Wv, Wr)
    vmap = visited_map_team[:dispH, :dispW]

    # Track colors for hunters; reused cyclically when there are more than five.
    track_colors = ["#00FFFF", "#FF8C00", "#32CD32", "#FF1493", "#FFD700"]

    # 1) Figure
    plt.style.use("dark_background")
    long_side = max(dispH, dispW)
    # Figure side in inches, adapted to the map size and clamped to [7.0, 10.0].
    scale = max(7.0, min(10.0, 6.8 * long_side / 20.0))
    fig = plt.figure(figsize=(scale, scale), facecolor="black")
    gs = fig.add_gridspec(nrows=1, ncols=2, width_ratios=[1.0, 0.045], wspace=0.035)
    ax = fig.add_subplot(gs[0, 0])
    cax = fig.add_subplot(gs[0, 1])

    # 2) Heatmap of visits by the whole TEAM
    im = ax.imshow(vmap, cmap="plasma", origin="upper", interpolation="nearest")
    cbar = fig.colorbar(im, cax=cax)
    cbar.set_label("Посещения (команда)", fontsize=14)
    cax.set_facecolor("black")
    cax.tick_params(colors="white", length=0)
    for spine in cax.spines.values():
        spine.set_color("black")

    # 3) Walls
    if raw_map is not None:
        if raw_map.ndim == 3:
            # In a 3-D map a wall is a cell whose channels 0 and 1 are both -1.
            walls_full = np.logical_and(raw_map[:, :, 0] == -1, raw_map[:, :, 1] == -1)
        else:
            walls_full = (raw_map == -1)
        walls = walls_full[:dispH, :dispW]
        ax.imshow(np.ma.masked_where(~walls, walls),
                  cmap=plt.cm.Greys_r, alpha=0.85, origin="upper", zorder=1)
        ax.contour(walls.astype(float), levels=[0.5],
                   colors="#00FFFF", linewidths=1.0, alpha=0.6, zorder=2)

    # 4) Paths of all hunters (broken at torus wrap-arounds)
    for k, pathk in enumerate(predator_paths):
        if not pathk or len(pathk) < 2:
            continue
        path = [(int(y), int(x)) for (y, x) in pathk if 0 <= y < dispH and 0 <= x < dispW]
        if len(path) < 2:
            continue
        color = track_colors[k % len(track_colors)]
        path = np.array(path, dtype=np.int32)
        ys, xs = path[:, 0], path[:, 1]
        seg_y, seg_x = [[ys[0]]], [[xs[0]]]
        for i in range(1, len(xs)):
            dx, dy = abs(xs[i] - xs[i-1]), abs(ys[i] - ys[i-1])
            # A jump longer than half the displayed size is treated as a
            # wrap-around: start a new segment instead of drawing across.
            if dx > dispW // 2 or dy > dispH // 2:
                seg_y.append([ys[i]])
                seg_x.append([xs[i]])
            else:
                seg_y[-1].append(ys[i])
                seg_x[-1].append(xs[i])
        for sy, sx in zip(seg_y, seg_x):
            if len(sx) > 1:
                ax.plot(sx, sy, color=color,
                        linewidth=1.7, alpha=0.95, zorder=5, clip_on=True)

        # Start / finish markers of hunter k, plus its index next to the finish.
        sy0, sx0 = path[0]
        sy1, sx1 = path[-1]
        ax.scatter([sx0], [sy0], c=color,
                   marker="o", s=55, edgecolors="black", linewidths=0.6, zorder=6)
        ax.scatter([sx1], [sy1], c=color,
                   marker="D", s=85, edgecolors="black", linewidths=0.7, zorder=7)
        ax.text(sx1 + 0.3, sy1 - 0.3, f"{k}", color=color,
                fontsize=12, weight="bold", zorder=8)

    # 5) Preys
    alive, dead = [], []
    if "preys" in info:
        for p in info["preys"]:
            y, x = int(p.get("y", -1)), int(p.get("x", -1))
            if 0 <= y < dispH and 0 <= x < dispW:
                (alive if p.get("alive", False) else dead).append((y, x))
    if alive:
        ys, xs = zip(*alive)
        ax.scatter(xs, ys, c="#7CFC00", marker="o", s=70,
                   edgecolors="black", linewidths=0.5, zorder=6, clip_on=True, label="alive")
    if dead:
        ys, xs = zip(*dead)
        ax.scatter(xs, ys, c="#FF6347", marker="X", s=90,
                   edgecolors="black", linewidths=0.6, zorder=6, clip_on=True, label="caught")

    # 6) Styling
    ax.set_xlim(-0.5, dispW - 0.5)
    ax.set_ylim(dispH - 0.5, -0.5)  # inverted y axis to match origin='upper'
    ax.set_aspect("equal", adjustable="box")
    ax.autoscale(False)
    # A prey without an "alive" key is counted as alive here (default True).
    caught_total = sum(1 for p in info.get("preys", []) if not p.get("alive", True))
    ax.set_title(
        f"Эпизод {ep} — Поймано: {caught_total} из {len(info.get('preys', []))}",
        fontsize=15, color="#EEEEEE", pad=10
    )
    ax.grid(True, alpha=0.18, color="#444444", linestyle=":")
    ax.set_axis_off()

    fig.subplots_adjust(left=0.02, right=0.985, top=0.93, bottom=0.02)
    fig.savefig(save_path, dpi=220, bbox_inches="tight", facecolor="black", pad_inches=0.02)
    plt.close(fig)
    print(f"Командная карта посещений сохранена: {save_path}")

def display_side_by_side(gif_path, heatmap_path, width=256):
    """
    Show the episode GIF and its heatmap next to each other in the notebook.

    Args:
        gif_path: Path or URL used as the ``src`` of the first image.
        heatmap_path: Path or URL used as the ``src`` of the second image.
        width: Width, in pixels, applied to both ``<img>`` tags.
    """
    # Two flex columns inside one centered flex row.
    markup = f"""
    <div style="display:flex; align-items:flex-start; gap:8px; justify-content:center;">
        <div style="display:flex; flex-direction:column; align-items:center;">
            <img src="{gif_path}" width="{width}" style="border:1px solid #444; image-rendering:pixelated; aspect-ratio:1/1;">        </div>
        <div style="display:flex; flex-direction:column; align-items:center;">
            <img src="{heatmap_path}" width="{width}" style="border:1px solid #444; aspect-ratio:1/1;">        </div>
    </div>
    """
    display(HTML(markup))

## Reward and logging

In [267]:
def _team_potential_preys(info, H, W):
    """
    Потенциал-команда: сумма torus-Manhattanских distances from КАЖДОГО нашего predatorа
    to ближайшей живой preys (if нет жертв — 0). Чем меньше — тем лучше.
    """
    preds = info.get("predators", [])
    preys = [q for q in info.get("preys", []) if q.get("alive", False)]
    if not preds or not preys:
        return 0.0

    pot = 0.0
    for pr in preds:
        py, px = int(pr["y"]), int(pr["x"])
        best = np.inf
        for q in preys:
            qy, qx = int(q["y"]), int(q["x"])
            d = manhattan_torus(py, px, qy, qx, H, W)
            if d < best:
                best = d
        pot += (0.0 if not np.isfinite(best) else float(best))
    return float(pot)

def compute_reward_static(prev_info, info, caught_preys,
                          step_idx, episode_idx, visited_map, phase,
                          actions_exec=None, actions_teacher=None):
    """
    Compute the shaped per-predator rewards for one step and log them.

    Reward components, in the order they are applied:
      * a constant base penalty for everyone;
      * a capture bonus (10 per newly caught prey) split evenly between all predators;
      * stand-still penalty, exploration bonus and revisit penalty driven by ``visited_map``;
      * shaping on the change of torus-L1 distance to the nearest living prey;
      * a bonus-cell reward, granted once per cell and only if the team potential did not grow;
      * capped pairwise repulsion once ``step_idx >= REPULSE_KICKS_IN_AFTER``;
      * flip-flop and "same direction while close" penalties, only when the team potential grew.

    Args:
        prev_info: Info dict of the previous step, or ``None`` on the first step.
        info: Info dict of the current step; reads ``"predators"``, ``"preys"``
            and optionally ``"map"`` and ``"scores"``.
        caught_preys: Set of ``(y, x)`` of preys already credited; updated in place.
        step_idx: Step index inside the episode.
        episode_idx: Episode index (logging only).
        visited_map: 2-D visit counter; incremented in place for every predator cell.
        phase: Label written to the step log.
        actions_exec: Optional executed actions per predator (logging only).
        actions_teacher: Optional teacher actions per predator (logging only).

    Returns:
        ``(rewards, caught_preys)`` where ``rewards`` is a float32 array of
        length K (number of predators).

    Side effects: mutates the module-level ``BONUS_TAKEN``, ``LAST_DIRS`` and
    ``STEP_LOG_BUFFER`` (one appended row per predator). Coefficients such as
    ``exploration_coef`` or ``repulse_boost`` are module-level names defined
    elsewhere in the file.
    """
    import numpy as np

    preds = info.get("predators", [])
    K = len(preds)
    rewards = np.zeros((K,), dtype=np.float32)

    # Small base penalty for everyone.
    r_base = -0.01
    rewards += r_base

    # Captures on this step (bonus is split between all predators).
    # Preys are identified by their (y, x) cell.
    if prev_info is not None:
        prev_alive = {(p["y"], p["x"]) for p in prev_info["preys"] if p["alive"]}
    else:
        prev_alive = set()
    curr_alive = {(p["y"], p["x"]) for p in info["preys"] if p["alive"]}
    new_caught = prev_alive - curr_alive
    new_caught -= caught_preys
    r_capture_each = np.zeros((K,), dtype=np.float32)
    if new_caught:
        total_bonus = 10.0 * len(new_caught)
        r_capture_each += (total_bonus / max(1, K))
        rewards += r_capture_each
        caught_preys |= new_caught

    # Map preparation.
    raw_map = None
    if ("map" in info) and (info["map"] is not None):
        raw_map = np.array(info["map"])
    elif "env_wrapper" in globals() and hasattr(env_wrapper.base_env.realm.world, "map"):
        raw_map = np.array(env_wrapper.base_env.realm.world.map)
    if raw_map is None:
        H = W = 0
    else:
        H, W = raw_map.shape[:2]

    # Team potentials before / after the step.
    pot_prev = _team_potential_preys(prev_info, H, W) if (raw_map is not None and prev_info is not None) else 0.0
    pot_curr = _team_potential_preys(info,     H, W) if (raw_map is not None) else 0.0

    # Helpers.
    alive_preys_list = [q for q in info.get("preys", []) if q.get("alive", False)]
    prev_preds = prev_info.get("predators", []) if prev_info is not None else []
    curr_coords = [(int(pr["y"]), int(pr["x"])) for pr in preds]
    curr_dirs = {}

    # Per-component values kept for the log.
    r_explore_each = np.zeros((K,), dtype=np.float32)
    r_revisit_each = np.zeros((K,), dtype=np.float32)
    r_stand_each   = np.zeros((K,), dtype=np.float32)
    r_bfs_each     = np.zeros((K,), dtype=np.float32)

    # Repulsion dynamics: the fewer preys remain alive, the stronger the repulsion.
    alive_cnt = len(alive_preys_list)
    total_cnt = max(1, len(info.get("preys", [])))
    repulse_scale = 1.0 + repulse_boost * (1.0 - (alive_cnt / total_cnt))

    # Per-predator pass.
    for k, pr in enumerate(preds):
        py, px = int(pr["y"]), int(pr["x"])

        # Standing still and movement direction.
        if prev_preds and k < len(prev_preds):
            py0, px0 = int(prev_preds[k]["y"]), int(prev_preds[k]["x"])
            if (py0, px0) == (py, px):
                r_stand_each[k] = stand_still_penalty
                rewards[k] -= r_stand_each[k]
            curr_dirs[k] = (py - py0, px - px0)

        # Exploration / revisit.
        if 0 <= py and 0 <= px and visited_map.shape[0] > py and visited_map.shape[1] > px:
            if visited_map[py, px] < 3:
                r = exploration_coef / (1 + visited_map[py, px])
                if step_idx < 20:
                    r *= 1.4  # extra exploration incentive on the first 20 steps
                r_explore_each[k] = r
                rewards[k] += r_explore_each[k]
            visited_map[py, px] += 1
            if visited_map[py, px] > 5:
                r_revisit_each[k] = revisit_penalty
                rewards[k] -= r_revisit_each[k]

        # Change of distance to the nearest prey (torus L1).
        if alive_preys_list and raw_map is not None:
            best_d = float("inf")
            best_yx = None
            for q in alive_preys_list:
                qy, qx = int(q["y"]), int(q["x"])
                d = manhattan_torus(py, px, qy, qx, H, W)
                if d < best_d:
                    best_d = d
                    best_yx = (qy, qx)
            if prev_preds and k < len(prev_preds) and best_yx is not None and np.isfinite(best_d):
                py0, px0 = int(prev_preds[k]["y"]), int(prev_preds[k]["x"])
                # Old distance is measured to the prey that is nearest NOW.
                old_d = manhattan_torus(py0, px0, best_yx[0], best_yx[1], H, W)
                new_d = best_d
                r_bfs_each[k] = shaping_coef * float(old_d - new_d)
                rewards[k] += r_bfs_each[k]

        # Bonus cell: only when it does not hurt progress (potential did not grow).
        if raw_map is not None and 0 <= py < H and 0 <= px < W:
            is_bonus = False
            if raw_map.ndim == 3:
                is_bonus = (raw_map[py, px, 0] == -3) or (raw_map[py, px, -1] == -3)
            else:
                is_bonus = (raw_map[py, px] == -3)
            if is_bonus:
                key = (py, px)
                if key not in BONUS_TAKEN and (pot_curr <= pot_prev + 1e-6):
                    late_scale = 1.0 + 0.5*(1.0 - alive_cnt/total_cnt)
                    rewards[k] += (bonus_reward_base * late_scale)
                    BONUS_TAKEN.add(key)

    # Repulsion / herding (after the early phase), capped per agent.
    # Pairwise distance here is plain L1, not torus L1.
    r_repulse_total = np.zeros((K,), dtype=np.float32)
    if step_idx >= REPULSE_KICKS_IN_AFTER:
        for i in range(K):
            yi, xi = curr_coords[i]
            for j in range(i + 1, K):
                yj, xj = curr_coords[j]
                d = abs(yi - yj) + abs(xi - xj)
                if d == 0:
                    r = repulse_same_cell * repulse_scale
                elif d == 1:
                    r = repulse_adjacent * repulse_scale
                elif d == 2:
                    r = repulse_radius2 * repulse_scale
                else:
                    r = 0.0
                if r > 0:
                    r_repulse_total[i] += r
                    r_repulse_total[j] += r
        r_repulse_total = np.minimum(r_repulse_total, REPULSE_CAP_PER_AGENT)
        rewards -= r_repulse_total

    # Anti-flipflop / same-dir-close: only when the potential shows no progress.
    r_flipflop_each = np.zeros((K,), dtype=np.float32)
    r_same_dir_close_each = np.zeros((K,), dtype=np.float32)
    progress_ok = (pot_curr <= pot_prev + 1e-6)
    if prev_preds and not progress_ok:
        for k in range(K):
            dy, dx = curr_dirs.get(k, (0, 0))
            last = LAST_DIRS.get(k, (0, 0))
            # Flip-flop = a non-zero move exactly opposite to the previous one.
            if (dy, dx) != (0, 0) and (dy == -last[0] and dx == -last[1]):
                r_flipflop_each[k] = flipflop_penalty
        rewards -= r_flipflop_each

        for i in range(K):
            yi, xi = curr_coords[i]
            vi = curr_dirs.get(i, (0, 0))
            for j in range(i + 1, K):
                yj, xj = curr_coords[j]
                if abs(yi - yj) + abs(xi - xj) <= 2:
                    vj = curr_dirs.get(j, (0, 0))
                    if vi != (0, 0) and vi == vj:
                        r_same = same_dir_close_penalty * 0.5
                        r_same_dir_close_each[i] += r_same
                        r_same_dir_close_each[j] += r_same
        rewards -= r_same_dir_close_each

    # Update the direction cache.
    for k, v in curr_dirs.items():
        LAST_DIRS[k] = v

    # === LOGS ===
    idle_preds = 0
    if prev_preds:
        prev_coords = [(int(pr["y"]), int(pr["x"])) for pr in prev_preds]
        for (py0, px0), (py1, px1) in zip(prev_coords, curr_coords):
            if (py0, px0) == (py1, px1):
                idle_preds += 1

    pair_d0 = 0
    pair_d1 = 0
    sum_d = 0.0
    pairs = 0
    for i in range(K):
        for j in range(i + 1, K):
            yi, xi = curr_coords[i]
            yj, xj = curr_coords[j]
            d = abs(yi - yj) + abs(xi - xj)
            if d == 0:
                pair_d0 += 1
            elif d == 1:
                pair_d1 += 1
            sum_d += d
            pairs += 1
    team_disp = (sum_d / max(1, pairs)) if pairs else 0.0

    prev_score = (prev_info["scores"][0] if (prev_info is not None and "scores" in prev_info) else 0)
    curr_score = (info["scores"][0] if "scores" in info else 0)
    score_delta = curr_score - prev_score

    # -1 marks "no action recorded" in the log.
    exec_list  = list(actions_exec)    if actions_exec    is not None else [-1] * K
    teach_list = list(actions_teacher) if actions_teacher is not None else [-1] * K

    for k, pr in enumerate(preds):
        py, px = int(pr["y"]), int(pr["x"])
        # Nearest prey, for logging only; (-1, -1) when no prey is alive.
        ny = nx = -1
        dy = dx = 0.0
        best_d = float("inf")
        for q in alive_preys_list:
            qy, qx = int(q["y"]), int(q["x"])
            d = manhattan_torus(py, px, qy, qx, H, W)
            if d < best_d:
                best_d = d
                ny, nx = qy, qx
                dy = float(ny - py)
                dx = float(nx - px)

        STEP_LOG_BUFFER.append([
            phase, int(episode_idx), int(step_idx),
            int(k), int(py), int(px),
            int(exec_list[k]) if k < len(exec_list) else -1,
            int(teach_list[k]) if k < len(teach_list) else -1,
            int(alive_cnt), int(len(caught_preys)), int(len(new_caught)),
            float(curr_score), float(score_delta),
            float(rewards[k]), float(r_base),
            float(r_capture_each[k]),
            float(r_explore_each[k]), float(r_stand_each[k]), float(r_revisit_each[k]),
            float(r_bfs_each[k]), float(r_repulse_total[k]),
            float(r_flipflop_each[k]), float(r_same_dir_close_each[k]),
            int(idle_preds), int(pair_d0), int(pair_d1), float(team_disp),
            int(ny), int(nx), float(dy), float(dx),
            list(new_caught)
        ])

    return rewards, caught_preys


def compute_reward_pvp(prev_info, info, caught_preys, step_idx, episode_idx, visited_map, phase,
                       actions_exec=None, actions_teacher=None):
    """
    PvP add-on on top of the base rewards from ``compute_reward_static``.

    Adds three small terms per predator:
      * intercept: reward for reducing the torus distance to the nearest enemy;
      * race: penalty when an enemy is clearly closer to our nearest prey;
      * gate-keeping: bonus for standing next to a prey while being closer
        to it than any enemy.

    Enemies are read from ``info["enemies"]`` (falling back to
    ``info["enemy"]``). If there are no predators, no enemies or no map, the
    base rewards are returned unchanged.

    Arguments and return value are the same as for ``compute_reward_static``.
    """
    rewards, caught_preys = compute_reward_static(prev_info, info, caught_preys,
                                                  step_idx, episode_idx, visited_map, phase,
                                                  actions_exec, actions_teacher)
    preds = info.get("predators", [])
    enemies = info.get("enemies", []) or info.get("enemy", [])
    if not preds or not enemies:
        return rewards, caught_preys

    raw_map = None
    if ("map" in info) and (info["map"] is not None):
        raw_map = np.array(info["map"])
    elif "env_wrapper" in globals() and hasattr(env_wrapper.base_env.realm.world, "map"):
        raw_map = np.array(env_wrapper.base_env.realm.world.map)
    if raw_map is None:
        return rewards, caught_preys
    H, W = raw_map.shape[:2]

    # Per-term coefficients.
    r_intercept = 0.005
    r_race_penalty = 0.004
    r_gate_keep = 0.003

    # Intercepting the opponent: reduce the torus distance to the nearest enemy.
    # Both distances are measured against the enemies' CURRENT positions.
    prev_preds = prev_info.get("predators", []) if prev_info is not None else []
    for k, pr in enumerate(preds):
        py, px = int(pr["y"]), int(pr["x"])
        best_d = min(manhattan_torus(py, px, int(en["y"]), int(en["x"]), H, W) for en in enemies)
        if prev_preds and k < len(prev_preds):
            py0, px0 = int(prev_preds[k]["y"]), int(prev_preds[k]["x"])
            best_d0 = min(manhattan_torus(py0, px0, int(en["y"]), int(en["x"]), H, W) for en in enemies)
            if np.isfinite(best_d) and np.isfinite(best_d0):
                rewards[k] += r_intercept * float(best_d0 - best_d)

    # Race to our nearest prey: bad if an enemy is clearly closer.
    alive_preys = [q for q in info.get("preys", []) if q.get("alive", False)]
    for k, pr in enumerate(preds):
        py, px = int(pr["y"]), int(pr["x"])
        my_best = (float("inf"), None)  # (distance, (y, x)) of this predator's nearest prey
        for q in alive_preys:
            qy, qx = int(q["y"]), int(q["x"])
            d = manhattan_torus(py, px, qy, qx, H, W)
            if d < my_best[0]:
                my_best = (d, (qy, qx))
        if my_best[1] is None:
            continue
        qy, qx = my_best[1]
        enemy_best = min(manhattan_torus(int(en["y"]), int(en["x"]), qy, qx, H, W) for en in enemies)
        if np.isfinite(my_best[0]) and np.isfinite(enemy_best) and (enemy_best + 1 < my_best[0]):
            rewards[k] -= r_race_penalty

        # Small "gate-keeping" bonus: we stand at the prey, closer than the enemy.
        if my_best[0] <= 1 and enemy_best > my_best[0] + 0.5:
            rewards[k] += r_gate_keep

    return rewards, caught_preys

## PvP environment versus the ClosestTarget bot

In [268]:
# === PvP environment versus the ClosestTarget bot on a MIX of maps ===
# Uses VersusBotEnv so that each step takes ONE list of actions (the bot lives in realm.bots).
# BEST_PVP had been swallowed by the comment line above and was never defined.
BEST_PVP = None
# Re-states the team size already set in the Settings cell.
TEAM_SIZE = 5
BEST_PVP_EVAL = None

## Training functions

In [269]:
# @torch.no_grad()def train_dagger_multi(agent: NetAgentShared, env_wrapper,
                       episodes=24,
                       beta_start=0.90, beta_end=0.05, render_every=6,
                       seed_base: int = 12345):
    """
    DAgger на смешанных картах (solo):
      • sticky-DAgger (учитель/студент),
      • τ-врата уверенности + короткий action-repeat,
      • лёгкая стохастика ученика (EPS),
      • anti-разворот на первом тике после отхода from учителя,
      • anti-idle + action-mask + anti «одна сtorusона»,
      • логи поведенческих метрик.
    Учитель используется только as оракул (лейблы).
    """
    import inspect
    teacher = AssignedClosestTargetAgent(num_predators=agent.team_size)

    beta_decay = 1.0 if episodes <= 1 else (beta_end / beta_start) ** (1.0 / (episodes - 1))
    beta = beta_start

    DIRS = {0:(0,0), 1:(0,1), 2:(0,-1), 3:(-1,0), 4:(1,0)}  # 0=stay,1=→,2=←,3=↑,4=↓
    pbar = tqdm(range(episodes), desc="DAgger-5", leave=True)
    for ep in pbar:
        # сброс кэша направлений (for flip/same) and bonusов        global LAST_DIRS, BONUS_TAKEN
        LAST_DIRS = {}
        BONUS_TAKEN = set()

        # новая map/сид        seed_ep = seed_base + ep
        random.seed(seed_ep); np.random.seed(seed_ep)

        base_env = getattr(env_wrapper, "base_env", None)
        can_seed_base = False
        if base_env is not None and hasattr(base_env, "reset"):
            try:
                if "seed" in inspect.signature(base_env.reset).parameters:
                    state, info = base_env.reset(seed=seed_ep)
                    env_wrapper.frames = []
                    env_wrapper.last_info = info
                    can_seed_base = True
            except Exception:
                can_seed_base = False
        if not can_seed_base:
            state, info = env_wrapper.reset()

        teacher.reset(state, team=0)

        # map/passability (фиксируем один раз; if у тебя map статична в эпизоде — этого достаточно)        world_map = env_wrapper.base_env.realm.world.map
        wm_np = np.array(world_map)
        if wm_np.ndim == 3:
            walls = (wm_np[:, :, 0] == -1) & (wm_np[:, :, 1] == -1)
        else:
            walls = (wm_np == -1)
        H, W = walls.shape
        def passable(y, x): return not walls[y % H, x % W]

        # аккумулируем визиты/пути        visited_map_ep   = np.zeros((H, W), dtype=np.int32)
        visited_map_team = np.zeros((H, W), dtype=np.int32)
        predator_paths   = [[] for _ in range(agent.team_size)]
        caught_preys     = set()

        # sticky/τ/repeat/idle        STICK_K  = 4
        TAU_CONF = 0.60
        KEEP_K   = 1
        src_hold = np.zeros(agent.team_size, dtype=np.int32)
        src_is_teacher = np.ones(agent.team_size, dtype=bool)
        keep_hold = np.zeros(agent.team_size, dtype=np.int32)
        prev_exec_actions = [0] * agent.team_size
        stay_streak = [0] * agent.team_size

        # поведенческие метрики        cluster_steps = idle_steps = 0
        max_idle_streak = _idle_streak = 0
        prev_pred_coords = None
        flip_events = same_dir_events = 0
        last_dirs_ep = [(0,0)] * agent.team_size

        # anti-«одна сtorusона»        repeat_cnt = [0] * agent.team_size
        last_action_seen = [None] * agent.team_size

        bc_losses = []
        first_capture_step = None

        # утилиты под расстояния to ближайшей targets        def nearest_prey_dist(py, px):
            preys_alive = [q for q in info["preys"] if q.get("alive", False)]
            if not preys_alive: 
                return 0.0
            return min(manhattan_torus(py, px, int(t["y"]), int(t["x"]), H, W) for t in preys_alive)

        def best_step_toward_nearest(k: int) -> int:
            """Жадно сокращаем torus-L1 to ближайшей живой preys из ДОПУСТИМЫХ ходов."""
            preys_alive = [q for q in info["preys"] if q.get("alive", False)]
            if not preys_alive:
                return 0
            py = int(info["predators"][k]["y"]); px = int(info["predators"][k]["x"])
            best_a, best_d = 0, float("inf")
            for a, (dy, dx) in DIRS.items():
                ny, nx = (py + dy) % H, (px + dx) % W
                if not passable(ny, nx): 
                    continue
                d = min(manhattan_torus(ny, nx, int(t["y"]), int(t["x"]), H, W) for t in preys_alive)
                if d < best_d:
                    best_d, best_a = d, a
            return best_a

        # основной цикл эпизода        for step in range(env_wrapper.base_env.realm.step_limit):
            # 1) действия учителя            t_actions = teacher.get_actions(state, team=0)

            # 2) BC            loss, feats_batch, y_batch = agent.train_step_bc_multi(info, world_map, t_actions)
            bc_losses.append(loss)
            agent.add_replay(feats_batch, y_batch)
            if beta > 0.20 or (step % 5) == 0:
                agent.replay_step(steps=1, batch=128)

            # 3) микширование            exec_actions = [0] * agent.team_size

            # один общий вызов ученика            student_actions = None
            student_logprobs = None
            use_student_mask = (np.random.rand() >= beta)
            if use_student_mask:
                student_actions, student_logprobs, _, _ = agent.get_actions(
                    info, world_map, training=True, greedy=False
                )

            # лёгкая стохастика ученика (exploration)            EPS = 0.03
            if use_student_mask and (np.random.rand() < EPS) and student_actions is not None:
                k_rand = np.random.randint(0, agent.team_size)
                py = int(info["predators"][k_rand]["y"])
                px = int(info["predators"][k_rand]["x"])
                legal = []
                for a, (dy, dx) in DIRS.items():
                    ny, nx = (py + dy) % H, (px + dx) % W
                    if a == 0 or passable(ny, nx):
                        legal.append(a)
                student_actions[k_rand] = int(np.random.choice(legal if legal else list(DIRS.keys())))

            for k in range(agent.team_size):
                if src_hold[k] <= 0:
                    src_is_teacher[k] = (np.random.rand() < beta)
                    src_hold[k] = STICK_K

                if src_is_teacher[k]:
                    a = int(t_actions[k])
                    exec_actions[k] = a
                    keep_hold[k] = 0  # сброс удержания, т.к. действие from учителя                else:
                    if student_actions is None:
                        sa, slp, _, _ = agent.get_actions(info, world_map, training=True, greedy=False)
                        cand = int(sa[k]); lp = float(slp[k].max().detach().cpu().item())
                    else:
                        cand = int(student_actions[k])
                        lp = float(student_logprobs[k].max().detach().cpu().item())

                    prob = float(np.exp(lp))
                    if (prob < TAU_CONF) or (keep_hold[k] > 0):
                        exec_actions[k] = prev_exec_actions[k]
                        keep_hold[k] = max(0, keep_hold[k] - 1)
                    else:
                        exec_actions[k] = cand
                        keep_hold[k] = KEEP_K

                src_hold[k] -= 1

            # 3.1 anti-разворот в момент отхода from учителя            def _is_reverse(v_new, v_last):
                return (v_new[0] == -v_last[0]) and (v_new[1] == -v_last[1]) and (v_new != (0,0))
            for k in range(agent.team_size):
                just_switched = (not src_is_teacher[k]) and (src_hold[k] == STICK_K-1)
                if not just_switched: 
                    continue
                v_last = last_dirs_ep[k] if k < len(last_dirs_ep) else (0,0)
                v_new  = DIRS.get(exec_actions[k], (0,0))
                if _is_reverse(v_new, v_last):
                    alt = int(t_actions[k])
                    if not _is_reverse(DIRS.get(alt,(0,0)), v_last):
                        exec_actions[k] = alt
                        src_is_teacher[k] = True
                        src_hold[k] = max(src_hold[k], 1)
                        keep_hold[k] = 0
                    else:
                        exec_actions[k] = 0
                        keep_hold[k] = 1  # постоим тик
            # 3.2 anti-idle (if стоим ≥3 тиков — форсим альтернативу)            for k in range(agent.team_size):
                if exec_actions[k] == 0:
                    stay_streak[k] += 1
                else:
                    stay_streak[k] = 0

                if stay_streak[k] >= 3:
                    alt = int(t_actions[k]) if int(t_actions[k]) != 0 else best_step_toward_nearest(k)
                    if alt != 0:
                        exec_actions[k] = alt
                        src_is_teacher[k] = True
                        src_hold[k] = max(src_hold[k], 1)
                        keep_hold[k] = 0
                        stay_streak[k] = 0

            # 3.3 ACTION MASK: не шагаем в стену + anti «одна сtorusона»            for k in range(agent.team_size):
                a = int(exec_actions[k])
                dy, dx = DIRS.get(a, (0,0))
                py = int(info["predators"][k]["y"]); px = int(info["predators"][k]["x"])
                ny, nx = (py + dy) % H, (px + dx) % W

                # (а) if step в стену — заменяем (сначала на учителя, else greedy к targets)                if a != 0 and not passable(ny, nx):
                    a_t = int(t_actions[k])
                    dy_t, dx_t = DIRS.get(a_t, (0,0))
                    ny_t, nx_t = (py + dy_t) % H, (px + dx_t) % W
                    if a_t != 0 and passable(ny_t, nx_t):
                        exec_actions[k] = a_t
                        src_is_teacher[k] = True
                        src_hold[k] = max(src_hold[k], 1)
                        keep_hold[k] = 0
                    else:
                        exec_actions[k] = best_step_toward_nearest(k)

                # (б) anti «одна сtorusона»: одно and то же действие N раз без прогресса — nudging                a = int(exec_actions[k])  # мог поменяться                if last_action_seen[k] == a:
                    repeat_cnt[k] += 1
                else:
                    repeat_cnt[k] = 1
                    last_action_seen[k] = a

                N_REPEAT = 6
                if repeat_cnt[k] >= N_REPEAT and a != 0:
                    py = int(info["predators"][k]["y"]); px = int(info["predators"][k]["x"])
                    d0 = nearest_prey_dist(py, px)
                    dy, dx = DIRS.get(a, (0,0))
                    ny, nx = (py + dy) % H, (px + dx) % W
                    d1 = nearest_prey_dist(ny, nx)
                    if not passable(ny, nx) or d1 >= d0:
                        alt = int(t_actions[k])
                        dyt, dxt = DIRS.get(alt, (0,0))
                        nyt, nxt = (py + dyt) % H, (px + dxt) % W
                        if alt != 0 and passable(nyt, nxt):
                            exec_actions[k] = alt
                            src_is_teacher[k] = True
                            src_hold[k] = max(src_hold[k], 1)
                            keep_hold[k] = 0
                        else:
                            exec_actions[k] = best_step_toward_nearest(k)
                        repeat_cnt[k] = 0  # сброс после вмешательства
            # 3.4 зафиксировать прошлые фактические действия (после всех правок)            for k in range(agent.team_size):
                prev_exec_actions[k] = exec_actions[k]

            # 4) step среды            state, done, new_info = env_wrapper.step(exec_actions)

            # --- поведенческие метрики ---            coords = [(int(pr["y"]), int(pr["x"])) for pr in new_info["predators"]]
            idle_now = 0
            if prev_pred_coords is not None:
                for (py0, px0), (py1, px1) in zip(prev_pred_coords, coords):
                    if (py0, px0) == (py1, px1):
                        idle_now += 1
            pair0 = pair1 = 0; sumd = 0.0; pairs = 0
            for i in range(len(coords)):
                for j in range(i + 1, len(coords)):
                    yi, xi = coords[i]; yj, xj = coords[j]
                    d = abs(yi - yj) + abs(xi - xj)
                    if d == 0: pair0 += 1
                    elif d == 1: pair1 += 1
                    sumd += d; pairs += 1
            disp = (sumd / max(1, pairs))
            if (disp <= 1.5) or (pair0 > 0) or (pair1 >= 2):
                cluster_steps += 1
            if idle_now >= (agent.team_size - 1):
                idle_steps += 1; _idle_streak += 1
            else:
                _idle_streak = 0
            max_idle_streak = max(max_idle_streak, _idle_streak)

            # flip/same            dirs_now = []
            if prev_pred_coords is not None:
                for k, (y1, x1) in enumerate(coords):
                    y0, x0 = prev_pred_coords[k]
                    dirs_now.append((y1 - y0, x1 - x0))
                for k, (dy, dx) in enumerate(dirs_now):
                    if (dy, dx) != (0,0) and (dy, dx) == (-last_dirs_ep[k][0], -last_dirs_ep[k][1]):
                        flip_events += 1
                for i in range(len(coords)):
                    for j in range(i+1, len(coords)):
                        d = abs(coords[i][0]-coords[j][0]) + abs(coords[i][1]-coords[j][1])
                        if d <= 2 and i < len(dirs_now) and j < len(dirs_now):
                            if dirs_now[i] != (0,0) and dirs_now[i] == dirs_now[j]:
                                same_dir_events += 1
                for k in range(len(dirs_now)):
                    last_dirs_ep[k] = dirs_now[k]
            else:
                for k in range(len(last_dirs_ep)):
                    last_dirs_ep[k] = (0,0)

            prev_pred_coords = coords

            # 5) треки/посещения            preds = new_info["predators"]
            for k, pr in enumerate(preds):
                py, px = int(pr["y"]), int(pr["x"])
                predator_paths[k].append((py, px))
                if 0 <= py < H and 0 <= px < W:
                    visited_map_team[py, px] += 1

            # 6) награда/логирование (for аналитики)            _, caught_preys = compute_reward_static(
                prev_info=info,
                info=new_info,
                caught_preys=caught_preys,
                step_idx=step,
                episode_idx=ep,
                visited_map=visited_map_ep,
                phase="BC-DAgger",
                actions_exec=exec_actions,
                actions_teacher=t_actions
            )

            # первая поимка            caught_now = sum(1 for p in new_info["preys"] if not p.get("alive", True))
            if first_capture_step is None and caught_now > 0:
                first_capture_step = step

            info = new_info
            if done:
                break

        # конец эпизода        beta = max(beta_end, beta * beta_decay)

        # запись буфера шагов        if STEP_LOG_BUFFER:
            with open(LOG_STEP_PATH, "a", newline="") as f:
                csv.writer(f).writerows(STEP_LOG_BUFFER)
            STEP_LOG_BUFFER.clear()

        caught_total = sum(1 for p in info["preys"] if not p.get("alive", True))
        avg_bc = float(np.mean(bc_losses)) if bc_losses else 0.0

        # визуал по расписанию        if render_every != 0 and ((ep + 1) % render_every == 0 or ep == 0 or ep == episodes - 1):
            gif_path = os.path.join(FRAME_DIR, f"dagger5_ep_{ep:03d}.gif")
            team_map_path = os.path.join(MAP_DIR, f"dagger5_team_ep{ep:03d}.png")
            make_color_gif(env_wrapper, gif_path, resize_factor=10, fps=8)
            visualize_team_map(ep, info, visited_map_team, predator_paths, team_map_path)
            display_side_by_side(gif_path, team_map_path, width=300)

        cluster_pct = cluster_steps / max(1, step + 1)
        idle_pct    = idle_steps    / max(1, step + 1)
        flip_rate   = flip_events   / max(1, step + 1)
        same_rate   = same_dir_events / max(1, step + 1)

        # мягкий decay LR к концу DAgger        if beta <= 0.20:
            for pg in agent.optimizer.param_groups:
                pg["lr"] = max(pg["lr"] * 0.95, LR * 0.3)

        print(f"[DAgger-5] ep={ep:03d} finished at step={step:03d}, beta={beta:.3f}, "
              f"caught={caught_total:03d}, first_cap={first_capture_step}, bc={avg_bc:.4f}, "
              f"cluster={cluster_pct:.2f}, idle={idle_pct:.2f}, flip={flip_rate:.2f}, same={same_rate:.2f}")

        pbar.set_postfix({
            "beta":   f"{beta:.3f}",
            "caught": f"{caught_total:03d}",
            "first":  first_capture_step if first_capture_step is not None else "-",
            "bc":     f"{avg_bc:.3f}",
            "clu":    f"{cluster_pct:.2f}",
            "idle":   f"{idle_pct:.2f}",
            "flip":   f"{flip_rate:.2f}",
            "same":   f"{same_rate:.2f}",
        })


## PvP training function

In [270]:
def train_dagger_pvp(agent: NetAgentShared, env_wrapper,
                     episodes=60,
                     beta_start=0.60, beta_end=0.05, render_every=10,
                     seed_base: int = 98765):
    """
    Run DAgger in the two-team environment (our agent vs. a scripted bot).

    On every step the executed action of each of our predators is produced by
    mixing teacher and student actions and then post-processing the result:
    sticky source selection, a confidence (tau) gate on student actions, a
    short action-repeat hold, anti-reversal, anti-idle, a wall action mask,
    an anti-"same direction" nudge and light student exploration.
    After each episode the score and the WIN/DRAW/LOSS outcome ("us-bot")
    are printed and pushed to the progress bar.

    Args:
        agent: learner; must provide `team_size`, `train_step_bc_multi`,
            `add_replay`, `replay_step` and `get_actions`.
        env_wrapper: environment wrapper exposing `reset()`, `step(actions)`
            and `base_env.realm` (`world.map`, `step_limit`).
        episodes: number of episodes to run.
        beta_start: initial probability of following the teacher.
        beta_end: lower bound for beta; beta decays geometrically towards it.
        render_every: render a GIF/team map every N episodes (plus the first
            and the last one); 0 or None disables rendering.
        seed_base: episode `ep` is seeded with `seed_base + ep`.

    Returns:
        None. Results are reported through prints, the progress bar, the
        step-log CSV and the rendered artefacts.
    """
    import inspect
    from itertools import combinations

    # Module-level caches that are cleared at the start of every episode.
    global LAST_DIRS, BONUS_TAKEN

    # action id -> (dy, dx): 0=stay, 1=right, 2=left, 3=up, 4=down
    DIRS = {0: (0, 0), 1: (0, 1), 2: (0, -1), 3: (-1, 0), 4: (1, 0)}
    STICK_K = 4      # steps an agent sticks to its chosen source (teacher/student)
    TAU_CONF = 0.60  # minimal student confidence required to take its action
    KEEP_K = 1       # steps a freshly accepted student action is held
    EPS = 0.03       # per-step probability of randomising one student action
    N_REPEAT = 6     # identical consecutive actions before the progress check
    IDLE_LIMIT = 3   # consecutive "stay" actions before forcing a move

    n_agents = agent.team_size
    teacher = AssignedClosestTargetAgent(num_predators=n_agents)

    # Geometric decay from beta_start to beta_end over the run; a non-positive
    # beta_start cannot be decayed geometrically, so beta is kept constant then.
    if episodes <= 1 or beta_start <= 0:
        beta_decay = 1.0
    else:
        beta_decay = (beta_end / beta_start) ** (1.0 / (episodes - 1))
    beta = beta_start

    def is_reverse(v_new, v_last):
        """True when the non-zero move `v_new` is the exact opposite of `v_last`."""
        return (v_new[0] == -v_last[0]) and (v_new[1] == -v_last[1]) and (v_new != (0, 0))

    # The helpers below are closures: they read the *current* values of
    # `info`, `walls`, `H`, `W`, `t_actions` and the per-step/per-episode
    # arrays of the enclosing function at call time.

    def passable(y, x):
        """True when the (wrapped) cell is not a wall."""
        return not walls[y % H, x % W]

    def predator_pos(k):
        """(y, x) of our predator `k` in the pre-step observation."""
        pred = info["predators"][k]
        return int(pred["y"]), int(pred["x"])

    def moved(py, px, action):
        """Cell reached from (py, px) by `action`, with wrap-around."""
        dy, dx = DIRS.get(action, (0, 0))
        return (py + dy) % H, (px + dx) % W

    def alive_preys():
        """Preys explicitly flagged alive in the pre-step observation."""
        return [q for q in info["preys"] if q.get("alive", False)]

    def nearest_prey_dist(y, x):
        """Distance (via `manhattan_torus`) to the closest alive prey; 0.0 if none."""
        targets = alive_preys()
        if not targets:
            return 0.0
        return min(manhattan_torus(y, x, int(t["y"]), int(t["x"]), H, W) for t in targets)

    def best_step_toward_nearest(k):
        """Greedy passable action that minimises the distance to the closest alive prey."""
        targets = alive_preys()
        if not targets:
            return 0
        py, px = predator_pos(k)
        best_a, best_d = 0, float("inf")
        for a in DIRS:
            ny, nx = moved(py, px, a)
            if not passable(ny, nx):
                continue
            d = min(manhattan_torus(ny, nx, int(t["y"]), int(t["x"]), H, W) for t in targets)
            if d < best_d:
                best_d, best_a = d, a
        return best_a

    def hand_to_teacher(k, action):
        """Execute `action` for agent `k` and mark the teacher as its source."""
        exec_actions[k] = action
        src_is_teacher[k] = True
        src_hold[k] = max(src_hold[k], 1)
        keep_hold[k] = 0

    def teacher_or_greedy(k):
        """Replace agent k's action by the teacher's if it is a passable move, else by the greedy step."""
        py, px = predator_pos(k)
        a_t = int(t_actions[k])
        if a_t != 0 and passable(*moved(py, px, a_t)):
            hand_to_teacher(k, a_t)
        else:
            exec_actions[k] = best_step_toward_nearest(k)

    pbar = tqdm(range(episodes), desc="DAgger-PvP", leave=True)
    for ep in pbar:
        # Map/seed diversity: a different seed for every episode.
        seed_ep = seed_base + ep
        random.seed(seed_ep)
        np.random.seed(seed_ep)

        # Reset the environment with the seed when the base env supports it;
        # this is best-effort and falls back to the wrapper's plain reset.
        base_env = getattr(env_wrapper, "base_env", None)
        seeded_reset = False
        if base_env is not None and hasattr(base_env, "reset"):
            try:
                if "seed" in inspect.signature(base_env.reset).parameters:
                    state, info = base_env.reset(seed=seed_ep)
                    env_wrapper.frames = []
                    env_wrapper.last_info = info
                    seeded_reset = True
            except Exception:
                seeded_reset = False
        if not seeded_reset:
            state, info = env_wrapper.reset()

        # Per-episode reset of the module-level caches.
        LAST_DIRS = {}
        BONUS_TAKEN = set()

        teacher.reset(state, team=0)

        world_map = env_wrapper.base_env.realm.world.map
        H, W = np.array(world_map).shape[:2]
        visited_map_ep = np.zeros((H, W), dtype=np.int32)
        visited_map_team = np.zeros((H, W), dtype=np.int32)
        predator_paths = [[] for _ in range(n_agents)]
        caught_preys = set()

        # Behaviour metrics.
        cluster_steps = idle_steps = 0
        flip_events = same_dir_events = 0
        prev_pred_coords = None
        last_dirs_ep = [(0, 0)] * n_agents

        # Sticky-source / tau-gate / repeat / idle state.
        src_hold = np.zeros(n_agents, dtype=np.int32)
        src_is_teacher = np.ones(n_agents, dtype=bool)
        keep_hold = np.zeros(n_agents, dtype=np.int32)
        prev_exec_actions = [0] * n_agents
        stay_streak = [0] * n_agents
        # Reset per episode so repeat counts never leak from the previous one.
        repeat_cnt = [0] * n_agents
        last_action_seen = [None] * n_agents

        bc_losses = []
        first_capture_step = None

        step = 0  # keeps the episode summary valid even with a zero step limit
        for step in range(env_wrapper.base_env.realm.step_limit):
            world_map = env_wrapper.base_env.realm.world.map
            wm_np = np.array(world_map)
            if wm_np.ndim == 3:
                walls = (wm_np[:, :, 0] == -1) & (wm_np[:, :, 1] == -1)
            else:
                walls = (wm_np == -1)

            # 1) Teacher actions for our team only.
            t_actions = teacher.get_actions(state, team=0)

            # 2) Behaviour-cloning update plus replay.
            loss, feats_batch, y_batch = agent.train_step_bc_multi(info, world_map, t_actions)
            bc_losses.append(loss)
            agent.add_replay(feats_batch, y_batch)
            if beta > 0.20 or (step % 5) == 0:
                agent.replay_step(steps=1, batch=128)

            # 3) Mix teacher and student actions.
            exec_actions = [0] * n_agents

            # One shared student call for the whole team.
            student_actions = None
            student_logprobs = None
            use_student_mask = (np.random.rand() >= beta)
            if use_student_mask:
                student_actions, student_logprobs, _, _ = agent.get_actions(
                    info, world_map, training=True, greedy=False
                )

            # Light student exploration: occasionally replace one student
            # action by a random legal one ("stay" is always legal). The same
            # lookups run unguarded further down, so no extra try/except here.
            if use_student_mask and (np.random.rand() < EPS) and student_actions is not None:
                k_rand = np.random.randint(0, n_agents)
                py, px = predator_pos(k_rand)
                legal = [a for a in DIRS if a == 0 or passable(*moved(py, px, a))]
                student_actions[k_rand] = int(np.random.choice(legal))

            for k in range(n_agents):
                if src_hold[k] <= 0:
                    src_is_teacher[k] = (np.random.rand() < beta)
                    src_hold[k] = STICK_K

                if src_is_teacher[k]:
                    exec_actions[k] = int(t_actions[k])
                    keep_hold[k] = 0
                else:
                    if student_actions is None:
                        # Sample the student lazily, once per step, instead of
                        # once per agent.
                        student_actions, student_logprobs, _, _ = agent.get_actions(
                            info, world_map, training=True, greedy=False
                        )
                    cand = int(student_actions[k])
                    lp = float(student_logprobs[k].max().detach().cpu().item())
                    prob = float(np.exp(lp))
                    if (prob < TAU_CONF) or (keep_hold[k] > 0):
                        # Low confidence or active hold: repeat the previous action.
                        exec_actions[k] = prev_exec_actions[k]
                        keep_hold[k] = max(0, keep_hold[k] - 1)
                    else:
                        exec_actions[k] = cand
                        keep_hold[k] = KEEP_K

                src_hold[k] -= 1

            # 3.1 Anti-reversal at the moment an agent leaves the teacher.
            for k in range(n_agents):
                just_switched = (not src_is_teacher[k]) and (src_hold[k] == STICK_K - 1)
                if not just_switched:
                    continue
                v_last = last_dirs_ep[k] if k < len(last_dirs_ep) else (0, 0)
                v_new = DIRS.get(exec_actions[k], (0, 0))
                if not is_reverse(v_new, v_last):
                    continue
                alt = int(t_actions[k])
                if not is_reverse(DIRS.get(alt, (0, 0)), v_last):
                    hand_to_teacher(k, alt)
                else:
                    exec_actions[k] = 0
                    keep_hold[k] = 1

            # 3.2 Anti-idle: after IDLE_LIMIT consecutive "stay" actions force
            # an alternative (teacher action, else the greedy step).
            for k in range(n_agents):
                stay_streak[k] = stay_streak[k] + 1 if exec_actions[k] == 0 else 0
                if stay_streak[k] >= IDLE_LIMIT:
                    alt = int(t_actions[k]) if int(t_actions[k]) != 0 else best_step_toward_nearest(k)
                    if alt != 0:
                        hand_to_teacher(k, alt)
                        stay_streak[k] = 0

            # Remember the actions as they stand after anti-idle.
            # NOTE(review): this snapshot is taken before the wall mask below,
            # whereas the single-team DAgger loop records it after all
            # corrections - confirm which one is intended.
            prev_exec_actions = list(exec_actions)

            # 3.3 Action mask: never step into a wall.
            for k in range(n_agents):
                a = int(exec_actions[k])
                py, px = predator_pos(k)
                if a != 0 and not passable(*moved(py, px, a)):
                    teacher_or_greedy(k)

            # Anti-"same direction": the same action N_REPEAT times in a row
            # without getting closer to a prey triggers a nudge.
            for k in range(n_agents):
                a = int(exec_actions[k])
                if last_action_seen[k] == a:
                    repeat_cnt[k] += 1
                else:
                    repeat_cnt[k] = 1
                    last_action_seen[k] = a

                if repeat_cnt[k] >= N_REPEAT and a != 0:
                    py, px = predator_pos(k)
                    ny, nx = moved(py, px, a)
                    d0 = nearest_prey_dist(py, px)
                    d1 = nearest_prey_dist(ny, nx)
                    if not passable(ny, nx) or d1 >= d0:
                        teacher_or_greedy(k)
                        repeat_cnt[k] = 0  # reset after the intervention

            # 4) Environment step.
            state, done, new_info = env_wrapper.step(exec_actions)

            # --- behaviour metrics ---
            coords = [(int(pr["y"]), int(pr["x"])) for pr in new_info["predators"]]

            idle_now = 0
            if prev_pred_coords is not None:
                idle_now = sum(1 for before, after in zip(prev_pred_coords, coords) if before == after)

            # Pairwise (non-wrapped) Manhattan distances between predators.
            pair_dists = [abs(yi - yj) + abs(xi - xj)
                          for (yi, xi), (yj, xj) in combinations(coords, 2)]
            pair0 = sum(1 for d in pair_dists if d == 0)
            pair1 = sum(1 for d in pair_dists if d == 1)
            disp = sum(pair_dists) / max(1, len(pair_dists))
            if (disp <= 1.5) or (pair0 > 0) or (pair1 >= 2):
                cluster_steps += 1
            if idle_now >= (n_agents - 1):
                idle_steps += 1

            if prev_pred_coords is not None:
                dirs_now = [(y1 - prev_pred_coords[k][0], x1 - prev_pred_coords[k][1])
                            for k, (y1, x1) in enumerate(coords)]
                # Flip: a predator moved exactly opposite to its previous move.
                for k, v in enumerate(dirs_now):
                    if is_reverse(v, last_dirs_ep[k]):
                        flip_events += 1
                # Same-direction: two nearby predators made the same non-zero move.
                # combinations() yields index pairs in the same order as pair_dists.
                for (i, j), d in zip(combinations(range(len(coords)), 2), pair_dists):
                    if d <= 2 and dirs_now[i] != (0, 0) and dirs_now[i] == dirs_now[j]:
                        same_dir_events += 1
                for k, v in enumerate(dirs_now):
                    last_dirs_ep[k] = v
            else:
                for k in range(len(last_dirs_ep)):
                    last_dirs_ep[k] = (0, 0)
            prev_pred_coords = coords

            # 5) Tracks / visit counts.
            for k, pr in enumerate(new_info["predators"]):
                py, px = int(pr["y"]), int(pr["x"])
                predator_paths[k].append((py, px))
                if 0 <= py < H and 0 <= px < W:
                    visited_map_team[py, px] += 1

            # 6) PvP reward + step log (the reward value itself is not used here).
            _, caught_preys = compute_reward_pvp(
                prev_info=info,
                info=new_info,
                caught_preys=caught_preys,
                step_idx=step,
                episode_idx=ep,
                visited_map=visited_map_ep,
                phase="BC-DAgger-PvP",
                actions_exec=exec_actions,
                actions_teacher=t_actions
            )

            # First capture of the episode.
            caught_now = sum(1 for p in new_info["preys"] if not p.get("alive", True))
            if first_capture_step is None and caught_now > 0:
                first_capture_step = step

            info = new_info
            if done:
                break

        beta = max(beta_end, beta * beta_decay)

        # Flush the step-log buffer to disk.
        if STEP_LOG_BUFFER:
            with open(LOG_STEP_PATH, "a", newline="") as f:
                csv.writer(f).writerows(STEP_LOG_BUFFER)
            STEP_LOG_BUFFER.clear()

        caught_total = sum(1 for p in info["preys"] if not p.get("alive", True))
        avg_bc = float(np.mean(bc_losses)) if bc_losses else 0.0

        # Score / outcome (we = team 0, bot = team 1); `scores` is read from
        # info and defaults to 0-0 when absent.
        scores = info.get("scores", [0.0, 0.0])
        sc0 = float(scores[0])
        sc1 = float(scores[1])
        if sc0 > sc1:
            result = "WIN"
        elif sc0 < sc1:
            result = "LOSS"
        else:
            result = "DRAW"

        # Scheduled visualisation; a falsy render_every disables it (and
        # avoids a modulo-by-zero).
        if render_every and ((ep + 1) % render_every == 0 or ep == 0 or ep == episodes - 1):
            gif_path = os.path.join(FRAME_DIR, f"pvp_dagger_ep_{ep:03d}.gif")
            team_map_path = os.path.join(MAP_DIR, f"pvp_team_ep{ep:03d}.png")
            make_color_gif(env_wrapper, gif_path, resize_factor=10, fps=8)
            visualize_team_map(ep, info, visited_map_team, predator_paths, team_map_path)
            display_side_by_side(gif_path, team_map_path, width=300)

        n_steps = max(1, step + 1)
        cluster_pct = cluster_steps / n_steps
        idle_pct = idle_steps / n_steps
        flip_rate = flip_events / n_steps
        same_rate = same_dir_events / n_steps

        print(f"[DAgger-PvP] ep={ep:03d} step={step:03d} beta={beta:.3f} "
              f"caught={caught_total:03d} first_cap={first_capture_step} bc={avg_bc:.4f} "
              f"cluster={cluster_pct:.2f} idle={idle_pct:.2f} flip={flip_rate:.2f} same={same_rate:.2f} "
              f"score_us-bot={sc0:.1f}-{sc1:.1f} result={result}")

        pbar.set_postfix({
            "beta":   f"{beta:.3f}",
            "caught": f"{caught_total:03d}",
            "first":  first_capture_step if first_capture_step is not None else "-",
            "bc":     f"{avg_bc:.3f}",
            "clu":    f"{cluster_pct:.2f}",
            "idle":   f"{idle_pct:.2f}",
            "flip":   f"{flip_rate:.2f}",
            "same":   f"{same_rate:.2f}",
            "score":  f"{sc0:.0f}-{sc1:.0f} {result}",
        })

## Saving and export

In [271]:
# === EXPORT: save the best model to agent.pkl (CPU) ===
def export_agent_pkl(agent, path="agent.pkl"):
    """
    Save the agent's model weights together with feature metadata.

    The stored object is a dict with two keys:
      - "state_dict": a copy of `agent.model.state_dict()` with tensors moved
        to CPU, so the file can be loaded on a machine without a GPU;
      - "meta": `n_actions`, `input_dim`, `patch_size`, `k_nearest`, `k_mates`.

    Args:
        agent: object with an initialised `model` and an `_input_dim` attribute.
        path: destination file; defaults to "agent.pkl" in the working directory.

    Raises:
        AssertionError: if `agent.model` is None.
    """
    assert agent.model is not None, "Модель ещё не инициализирована"

    # Deep-copy first so the live model is never touched, then make sure
    # every tensor of the copy lives on the CPU.
    state = copy.deepcopy(agent.model.state_dict())
    for key, value in state.items():
        if torch.is_tensor(value):
            state[key] = value.cpu()

    obj = {
        "state_dict": state,
        "meta": {
            "n_actions": 5,
            "input_dim": int(agent._input_dim),  # length D of the feature vector
            "patch_size": int(PATCH_SIZE),
            "k_nearest": int(K_NEAREST),
            "k_mates": 2,  # FeatureBuilder default, per the original note - TODO confirm
        },
    }
    torch.save(obj, path)
    print(f"Сохранено: {os.path.abspath(path)}")

# (optional) mini-eval for picking the "best" model before saving:
# if training is already finished, just call export_agent_pkl(agent5)

def _export_agent_pkl(agent, path):
    """
    Save a model checkpoint, creating missing parent directories first.

    The stored object is a dict with:
      - "state_dict": a deep copy of `agent.model.state_dict()`;
      - "optimizer": `agent.optimizer.state_dict()`, or None when the agent
        has no optimizer;
      - "meta": `agent.meta` when present, otherwise an empty dict.

    Args:
        agent: object with an initialised `model`; `optimizer` and `meta`
            are optional attributes.
        path: destination file path.

    Returns:
        The absolute path of the written file.
    """
    import copy
    import os
    import torch

    # Make sure the destination folder exists (an empty dirname means the
    # current working directory, which needs no creation).
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    meta = getattr(agent, "meta", {})
    # If required, `meta["input_dim"]` could additionally be filled in from
    # the width of a sample model input.

    optimizer = getattr(agent, "optimizer", None)
    obj = {
        "state_dict": copy.deepcopy(agent.model.state_dict()),
        "optimizer": optimizer.state_dict() if optimizer is not None else None,
        "meta": meta,
    }
    torch.save(obj, path)
    return os.path.abspath(path)

## Teacher-student training functions

In [272]:
@torch.no_grad()
def _teacher_logits(agent_teacher: NetAgentShared, feats_np: np.ndarray, device):
    """
    Evaluate the teacher network on a batch of features without gradients.

    The numpy batch is converted to a float32 tensor on `device`, the teacher
    model is switched to eval mode (it is not switched back afterwards) and
    the first of the model's two outputs is returned.

    Returns:
        The teacher logits tensor (shape [B, A] according to the original
        note - TODO confirm against the model definition).
    """
    batch = torch.tensor(feats_np, dtype=torch.float32, device=device)
    teacher_net = agent_teacher.model
    teacher_net.eval()
    teacher_out, _aux = teacher_net(batch)
    return teacher_out


def distill_to_light(agent_teacher: NetAgentShared,
                     agent_student: NetAgentSharedLite,
                     steps: int = 2000,
                     batch: int = 256,
                     temperature: float = 1.5,
                     alpha_kd: float = 0.8):
    """
    Distil the "heavy" teacher model into the "light" student model.

    Training data is the teacher agent's BC replay (`agent_teacher.rb_feats`).
    The loss is a weighted sum of
      - KD: KL divergence between softmax(teacher / T) and softmax(student / T),
        scaled by T^2, and
      - CE: cross-entropy of the student logits against the teacher's argmax.

    Args:
        agent_teacher: agent whose `model` provides the target logits and whose
            `rb_feats` replay provides the inputs.
        agent_student: agent whose `model` is trained with its own `optimizer`;
            the model is created from one replay sample if it does not exist yet.
        steps: number of optimisation steps.
        batch: maximum batch size (capped by the replay size).
        temperature: softmax temperature T.
        alpha_kd: weight of the KD term; the CE term gets `1 - alpha_kd`.

    Returns:
        None. Nothing is trained when the replay is empty.

    Raises:
        AssertionError: if the teacher model is missing, or if the student
            model has to be created while the replay is empty.
    """
    assert agent_teacher.model is not None, "Teacher не инициализирован"
    # Make sure the student has a network with the same input width D.
    if agent_student.model is None:
        # Any replay feature vector is enough to build the model.
        assert len(agent_teacher.rb_feats) > 0, "Реплей пуст — надо сначала пройти SOLO/PvP DAgger"
        agent_student._ensure_model(agent_teacher.rb_feats[0][None, :])

    device_s = next(agent_student.model.parameters()).device
    # The teacher may live on a different device than the student: run it on
    # its own device and move the resulting logits over.
    device_t = next(agent_teacher.model.parameters()).device
    agent_student.model.train()

    T = float(temperature)
    kl = nn.KLDivLoss(reduction="batchmean")
    ce = nn.CrossEntropyLoss()

    n_samples = len(agent_teacher.rb_feats)
    if n_samples == 0:
        return
    batch_size = min(batch, n_samples)

    rng = np.random.default_rng(0)  # fixed seed: reproducible batch order
    for _ in range(steps):
        idx = rng.integers(0, n_samples, size=batch_size)
        feats = np.stack([agent_teacher.rb_feats[i] for i in idx], axis=0)  # [B, D]

        # Teacher targets (no gradients).
        with torch.no_grad():
            t_logits = _teacher_logits(agent_teacher, feats, device_t).to(device_s) / T
            t_prob = torch.softmax(t_logits, dim=-1)

        # Student forward pass.
        x = torch.tensor(feats, dtype=torch.float32, device=device_s)
        s_logits, _ = agent_student.model(x)
        s_logprob_T = torch.log_softmax(s_logits / T, dim=-1)

        # KD term plus a small CE term on the teacher's argmax.
        loss_kd = kl(s_logprob_T, t_prob) * (T * T)
        hard = torch.argmax(t_prob, dim=-1)
        loss_ce = ce(s_logits, hard)
        loss = alpha_kd * loss_kd + (1.0 - alpha_kd) * loss_ce

        agent_student.optimizer.zero_grad(set_to_none=True)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(agent_student.model.parameters(), 1.0)
        agent_student.optimizer.step()

## Multi-map loader for the two-team setup

In [273]:
def build_pvp_env_mixed(
    team_size: int = TEAM_SIZE,
    step_limit: int = 300,
    spawn_bonus_every: int = 5,
    use_pregenerated_dir: str | None = None
):
    """
    Build a rendered VersusBotEnv backed by a MixedMapLoader.

    The environment hosts two hunter teams (we are team 0, the scripted bot
    is team 1) and rotates over a list of two-team map generators.

    Args:
        team_size: number of hunters per team; also used to size the bot.
        step_limit: step limit forwarded to Realm.
        spawn_bonus_every: forwarded to Realm unchanged.
        use_pregenerated_dir: optional folder with pregenerated maps; when
            truthy a PregeneratedMapLoader is added to the mix (best effort).

    Returns:
        RenderedEnvWrapper around the VersusBotEnv.
    """
    loaders = []

    # --- A dozen diverse two-team generators ---
    # Rocks: from sparse to dense.
    for p in [0.01, 0.03, 0.06, 0.10, 0.15]:
        loaders.append(TwoTeamRocksMapLoader(
            size=40, spawn_radius=8, preys_num=100, spawn_points=10,
            rock_spawn_proba=p, additional_rock_spawn_proba=0.20
        ))

    # Labyrinth: from well-connected to narrow.
    for add_links in [12, 9, 6, 3, 1]:
        loaders.append(TwoTeamLabyrinthMapLoader(
            size=40, spawn_radius=8, preys_num=100, spawn_points=10,
            additional_links_max=add_links, additional_links_min=max(0, add_links-2)
        ))

    # Single-team loaders are deliberately NOT added here, to keep the
    # two-team task semantics intact.

    # (optional) folder with pregenerated maps, if provided.
    if use_pregenerated_dir:
        try:
            loaders.append(PregeneratedMapLoader(dir=use_pregenerated_dir))
        except Exception as e:
            # Best effort: a broken/missing folder must not prevent training.
            print(f"[WARN] Pregenerated skip: {e}")

    mix_loader = MixedMapLoader(loaders)

    # Opponent bot (walks towards the closest targets).
    bot = ClosestTargetAgent(num_predators=team_size)

    realm = Realm(
        map_loader=mix_loader,
        playable_teams_num=2,             # us + bot
        bots={1: bot},                    # bot = team 1
        playable_team_size=team_size,
        step_limit=step_limit,
        spawn_bonus_every=spawn_bonus_every
    )

    base_env = VersusBotEnv(realm)
    env_pvp = RenderedEnvWrapper(base_env)  # wrapper used for frame capture
    return env_pvp

# === NEW: mixed loader for two teams (PvP) ===
def build_twoteam_mixed_loader(size=40, preys_num=100, team_size=5,
                               rocks_grid=None, lab_links=None, spawn_points=None):
    """
    Build a MixedMapLoader for two teams: rocks + labyrinth generators.

    NOTE: the two-team loaders are called without a team_size argument here;
    spawn_points is passed instead and is treated as the number of spawn
    points PER TEAM (defaults to team_size).

    Args:
        size: map side length passed to every loader.
        preys_num: number of preys passed to every loader.
        team_size: fallback for spawn_points when that is None.
        rocks_grid: iterable of (rock_spawn_proba, additional_rock_spawn_proba)
            pairs; a built-in 4-entry grid is used when None.
        lab_links: iterable of (additional_links_max, additional_links_min)
            pairs; a built-in 4-entry grid is used when None.
        spawn_points: spawn points per team; None -> team_size.

    Returns:
        MixedMapLoader over all constructed loaders (rocks first, then
        labyrinths).
    """
    sp = team_size if spawn_points is None else int(spawn_points)
    loaders = []

    # Rocks presets.
    if rocks_grid is None:
        rocks_grid = [(0.03, 0.05), (0.07, 0.12), (0.11, 0.18), (0.15, 0.21)]
    for p, ap in rocks_grid:
        loaders.append(
            TwoTeamRocksMapLoader(
                size=size,
                preys_num=preys_num,
                spawn_points=sp,                   # used instead of team_size
                rock_spawn_proba=p,
                additional_rock_spawn_proba=ap
            )
        )

    # Labyrinth presets.
    if lab_links is None:
        lab_links = [(20, 10), (12, 6), (8, 3), (3, 1)]
    for lmax, lmin in lab_links:
        loaders.append(
            TwoTeamLabyrinthMapLoader(
                size=size,
                preys_num=preys_num,
                spawn_points=sp,                   # used instead of team_size
                additional_links_max=lmax,
                additional_links_min=lmin
            )
        )

    return MixedMapLoader(loaders)

# === VARIANT B: SOLO loader with full variability of map size and type ===
def build_singleteam_mixed_loader_B(
    sizes=(32, 40, 48, 56),
    preys_num_grid=(80, 100, 120),
    spawn_points_grid=None,     # None -> (TEAM_SIZE,)
    rocks_grid=None,
    lab_links=None,
    pregenerated_dir=None,
):
    """
    Build a MixedMapLoader for a single team over a grid of configurations.

    For every combination of size, preys number and spawn-points value, one
    SingleTeamRocksMapLoader per rocks_grid entry and one
    SingleTeamLabyrinthMapLoader per lab_links entry is created.

    Args:
        sizes: map side lengths to include.
        preys_num_grid: prey counts to include.
        spawn_points_grid: spawn-points values; None -> (TEAM_SIZE,).
        rocks_grid: (rock_spawn_proba, additional_rock_spawn_proba) pairs;
            a built-in 8-entry grid is used when None.
        lab_links: (additional_links_max, additional_links_min) pairs;
            a built-in 8-entry grid is used when None.
        pregenerated_dir: optional folder; when it exists and is non-empty a
            PregeneratedMapLoader for it is appended to the mix.

    Returns:
        MixedMapLoader over all constructed loaders.

    NB (author's note): Realm/Env are expected to tolerate map-size changes
    between episodes; the training code recreates its visited maps per map.
    """
    loaders = []

    # Default grids (diverse enough).
    if rocks_grid is None:
        # (rock_spawn_proba, additional_rock_spawn_proba)
        rocks_grid = [(0.01, 0.00), (0.03, 0.05), (0.05, 0.10), (0.07, 0.12),
                      (0.09, 0.15), (0.11, 0.18), (0.13, 0.20), (0.15, 0.21)]
    if lab_links is None:
        # (additional_links_max, additional_links_min)
        lab_links = [(24, 12), (20, 10), (16, 8), (12, 6),
                     (10, 4), (8, 3), (6, 2), (3, 1)]

    # Loop-invariant, so resolved once instead of per (size, preys) pair.
    sp_candidates = (spawn_points_grid if spawn_points_grid is not None
                     else (TEAM_SIZE,))

    for S in sizes:
        for PNUM in preys_num_grid:
            for SP in sp_candidates:
                # --- Rocks ---
                for p, ap in rocks_grid:
                    loaders.append(
                        SingleTeamRocksMapLoader(
                            size=S, preys_num=PNUM, spawn_points=int(SP),
                            rock_spawn_proba=p, additional_rock_spawn_proba=ap
                        )
                    )
                # --- Labyrinth ---
                for lmax, lmin in lab_links:
                    loaders.append(
                        SingleTeamLabyrinthMapLoader(
                            size=S, preys_num=PNUM, spawn_points=int(SP),
                            additional_links_max=lmax, additional_links_min=lmin
                        )
                    )

    # Pregenerated maps (only if there is something to mix in).
    if pregenerated_dir and os.path.isdir(pregenerated_dir) and len(os.listdir(pregenerated_dir)) > 0:
        loaders.append(PregeneratedMapLoader(pregenerated_dir))

    return MixedMapLoader(loaders)

# === VARIANT B: PvP loader with full variability of sizes and parameters ===
def build_twoteam_mixed_loader_B(
    sizes=(32, 40, 48, 56),
    preys_num_grid=(80, 100, 120),
    team_size=5,
    spawn_points_grid=None,     # None -> (team_size,)
    rocks_grid=None,
    lab_links=None
):
    """
    Build a MixedMapLoader for the two-team PvP setting over a config grid.

    For every combination of size, preys number and spawn-points value, one
    TwoTeamRocksMapLoader per rocks_grid entry and one
    TwoTeamLabyrinthMapLoader per lab_links entry is created.

    Args:
        sizes: map side lengths to include.
        preys_num_grid: prey counts to include.
        team_size: fallback spawn-points value when spawn_points_grid is None.
        spawn_points_grid: spawn points PER TEAM; None -> (team_size,).
        rocks_grid: (rock_spawn_proba, additional_rock_spawn_proba) pairs;
            a built-in 7-entry grid is used when None.
        lab_links: (additional_links_max, additional_links_min) pairs;
            a built-in 5-entry grid is used when None.

    Returns:
        MixedMapLoader over all constructed loaders.
    """
    sp_default = (team_size,)
    sp_candidates = spawn_points_grid if spawn_points_grid is not None else sp_default

    if rocks_grid is None:
        rocks_grid = [(0.03, 0.05), (0.05, 0.10), (0.07, 0.12), (0.09, 0.15),
                      (0.11, 0.18), (0.13, 0.20), (0.15, 0.21)]
    if lab_links is None:
        lab_links = [(20, 10), (16, 8), (12, 6), (8, 3), (3, 1)]

    loaders = []
    for S in sizes:
        for PNUM in preys_num_grid:
            for SP in sp_candidates:
                # Rocks
                for p, ap in rocks_grid:
                    loaders.append(
                        TwoTeamRocksMapLoader(
                            size=S,
                            preys_num=PNUM,
                            spawn_points=int(SP),    # important: spawn_points, not team_size
                            rock_spawn_proba=p,
                            additional_rock_spawn_proba=ap
                        )
                    )
                # Labyrinth
                for lmax, lmin in lab_links:
                    loaders.append(
                        TwoTeamLabyrinthMapLoader(
                            size=S,
                            preys_num=PNUM,
                            spawn_points=int(SP),
                            additional_links_max=lmax,
                            additional_links_min=lmin
                        )
                    )
    return MixedMapLoader(loaders)

# === VARIANT B: PvP environment on the full mix ===
def build_pvp_env_mixed_B(
    team_size: int = TEAM_SIZE,
    step_limit: int = 300,
    spawn_bonus_every: int = 5,
    use_pregenerated_dir: str | None = None
):
    """
    Build a rendered VersusBotEnv on the full PvP mix of map sizes/types.

    Args:
        team_size: hunters per team; also sizes the scripted opponent.
        step_limit: step limit forwarded to Realm.
        spawn_bonus_every: forwarded to Realm unchanged.
        use_pregenerated_dir: optional folder; when it exists and is
            non-empty, a PregeneratedMapLoader for it is added to the mix.

    Returns:
        RenderedEnvWrapper around the VersusBotEnv.
    """
    # Mix in a few hundred configurations via different sizes/grids.
    mix_loader = build_twoteam_mixed_loader_B(
        sizes=(32, 40, 48, 56),
        preys_num_grid=(80, 100, 120),
        team_size=team_size,
        spawn_points_grid=(team_size, team_size+1),  # sometimes one extra spawn point
        rocks_grid=[(0.03, 0.05), (0.07, 0.12), (0.11, 0.18), (0.15, 0.21)],
        lab_links=[(20, 10), (12, 6), (8, 3), (3, 1)],
    )

    # (optional) add pregenerated maps.
    if use_pregenerated_dir and os.path.isdir(use_pregenerated_dir) and len(os.listdir(use_pregenerated_dir)) > 0:
        preg = PregeneratedMapLoader(use_pregenerated_dir)
        # Rebuild the mix with the extra loader appended.
        # NOTE(review): relies on MixedMapLoader exposing `.loaders` — confirm.
        # list(...) keeps this working if the pool is stored as a tuple.
        mix_loader = MixedMapLoader(list(mix_loader.loaders) + [preg])

    bot = ClosestTargetAgent(num_predators=team_size)

    realm = Realm(
        map_loader=mix_loader,
        playable_teams_num=2,
        bots={1: bot},
        playable_team_size=team_size,
        step_limit=step_limit,
        spawn_bonus_every=spawn_bonus_every
    )

    base_env = VersusBotEnv(realm)
    return RenderedEnvWrapper(base_env)

## Model loading

In [274]:
def _strip_module_prefix(state_dict: dict):
    """Убираем префикс 'module.' from DataParallel/DistributedDataParallel, if есть."""
    if not state_dict:
        return state_dict
    sample_key = next(iter(state_dict.keys()))
    if sample_key.startswith("module."):
        return {k.replace("module.", "", 1): v for k, v in state_dict.items()}
    return state_dict

def load_agent_from_pkl(agent_obj, ckpt_path: str):
    """
    Load model (and, when possible, optimizer) weights into an existing agent.

    Supported checkpoint layouts:
      - torch.save({'model': sd, 'optimizer': sd_opt, ...})
      - torch.save({'state_dict': sd}) / {'model_state_dict': sd}
      - torch.save(sd)  # the state_dict itself

    Args:
        agent_obj: agent with an initialised `.model` (its first parameter's
            device is used as map_location) and optionally `.optimizer`.
        ckpt_path: path to the checkpoint file.

    Returns:
        agent_obj, with weights loaded non-strictly (missing/unexpected keys
        are only reported).

    Raises:
        AssertionError: if ckpt_path does not exist.
        ValueError: if the payload is a dict but no state_dict can be found.
    """
    assert os.path.exists(ckpt_path), f"Нет файла: {ckpt_path}"
    device = next(agent_obj.model.parameters()).device

    # 1) Load the payload with torch.load (NOT pickle.load).
    # NOTE(security): torch.load unpickles data — only use trusted checkpoints.
    payload = torch.load(ckpt_path, map_location=device)
    payload_is_dict = isinstance(payload, dict)

    # 2) Locate the model state_dict under the known key variants.
    if payload_is_dict:
        sd = None
        for k in ("model", "state_dict", "model_state_dict"):
            if k in payload and isinstance(payload[k], dict):
                sd = payload[k]
                break
        if sd is None and all(isinstance(k, str) for k in payload):
            # The payload itself is probably the state_dict.
            sd = payload
        if sd is None:
            raise ValueError(f"Не нашёл state_dict в чекпойнте: ключи={list(payload.keys())[:10]}")
    else:
        # The whole state_dict was saved as a non-dict object.
        sd = payload

    sd = _strip_module_prefix(sd)
    missing, unexpected = agent_obj.model.load_state_dict(sd, strict=False)
    if missing:
        print(f"[WARN] Missing keys: {len(missing)} (первые 5): {missing[:5]}")
    if unexpected:
        print(f"[WARN] Unexpected keys: {len(unexpected)} (первые 5): {unexpected[:5]}")

    # 3) Optimizer state (only for dict payloads, if present and compatible).
    if payload_is_dict and "optimizer" in payload and getattr(agent_obj, "optimizer", None) is not None:
        try:
            agent_obj.optimizer.load_state_dict(payload["optimizer"])
        except Exception as e:
            # Best effort: an incompatible optimizer state must not block
            # loading the model weights.
            print(f"[WARN] Optimizer state не загружен: {e}")

    print(f"[OK] Модель загружена из: {ckpt_path} на устройство {device}")
    return agent_obj

## Initialization

In [275]:
# === SINGLE CELL: preset for ~3 hours of CPU (no rendering) ===

# --- speed toggles ---
RENDER_EVERY_BIG = 10  # author's note: effectively disables visualization

# Limit the "heavy" maps a bit: 32/40 more often, 48/56 less often.
SIZES_MIX  = (32, 32, 40, 40, 48)      # 56 removed (bring it back if desired)
PREYS_GRID = (80, 100)                 # 120 is rarer/more expensive — removed
TEAM_SIZE  = 5
# Self-assignments: no-ops that fail fast with NameError if the feature
# constants were not defined by an earlier cell.
PATCH_SIZE = PATCH_SIZE
K_NEAREST  = K_NEAREST

# Steps per episode: Stage 1 a bit more, Stage 2 a bit less (PvP is costlier).
STEPS_STAGE1 = 300
STEPS_STAGE2 = 240

# --- STAGE 1: SOLO DAGGER ---
mixed_loader = build_singleteam_mixed_loader_B(
    sizes=SIZES_MIX,
    preys_num_grid=PREYS_GRID,
    spawn_points_grid=(TEAM_SIZE,),
    pregenerated_dir=None
)

realm = Realm(
    map_loader=mixed_loader,
    playable_teams_num=1,
    playable_team_size=TEAM_SIZE,
    step_limit=STEPS_STAGE1
)

env = OnePlayerEnv(realm)
env_wrapper = RenderedEnvWrapper(env)

# Agent
fbuild  = FeatureBuilder(patch_size=PATCH_SIZE, k_nearest=K_NEAREST)
agent   = NetAgentShared(fbuild, team_size=TEAM_SIZE)

print("[STAGE 1] SOLO DAgger...")
train_dagger_multi(
    agent, env_wrapper,
    episodes=400,        # the bulk of the time budget
    beta_start=0.95,
    beta_end=0.05,
    render_every=0,
    seed_base=12345
)

# Checkpoint
os.makedirs(os.path.join(LOG_DIR, "checkpoints"), exist_ok=True)
ckpt_solo = _export_agent_pkl(agent, path=os.path.join(LOG_DIR, "checkpoints", "agent_solo.pkl"))
shutil.copyfile(ckpt_solo, "agent_solo.pkl")
print("Solo checkpoint:", ckpt_solo)

# --- STAGE 2: PvP DAgger against ClosestTarget ---
print("[STAGE 2] PvP DAgger vs ClosestTarget...")

mixed_loader_pvp = build_twoteam_mixed_loader_B(
    sizes=SIZES_MIX,
    preys_num_grid=PREYS_GRID,
    team_size=TEAM_SIZE,
    spawn_points_grid=(TEAM_SIZE, TEAM_SIZE+1)  # sometimes +1 — fine
)

bot_enemy = ClosestTargetAgent(num_predators=TEAM_SIZE)

realm_pvp = Realm(
    map_loader=mixed_loader_pvp,
    playable_teams_num=2,
    bots={1: bot_enemy},
    playable_team_size=TEAM_SIZE,
    step_limit=STEPS_STAGE2
)

env_pvp  = VersusBotEnv(realm_pvp)
envw_pvp = RenderedEnvWrapper(env_pvp)

train_dagger_pvp(
    agent, envw_pvp,
    episodes=300,         # balanced against Stage 1
    beta_start=0.60,
    beta_end=0.05,
    render_every=10,
    seed_base=777000
)

final_ckpt = _export_agent_pkl(agent, path=os.path.join(LOG_DIR, "checkpoints", "agent_pvp.pkl"))
shutil.copyfile(final_ckpt, "agent.pkl")
print("Final PvP checkpoint:", final_ckpt)
print("Сохранено также в ./agent.pkl")

# --- STAGE 3: Distill → Lite + short (reduced) PvP DAgger ---
print("[STAGE 3] Distill → Lite model...")

def _ensure_teacher_replay(agent_teacher, env_for_rollout, min_items=800, seed_base=999000):
    if len(agent_teacher.rb_feats) >= min_items:
        return
    print(f"[warmup] teacher replay {len(agent_teacher.rb_feats)} < {min_items} — добиваем коротким роллаутом...")
    import inspect
    teacher_bot = AssignedClosestTargetAgent(num_predators=agent_teacher.team_size)
    episodes_warm, steps_cap = 4, 120
    for ep in range(episodes_warm):
        base_env = getattr(env_for_rollout, "base_env", None)
        seed_ep = seed_base + ep
        did = False
        if base_env is not None and hasattr(base_env, "reset"):
            try:
                if "seed" in inspect.signature(base_env.reset).parameters:
                    state, info = base_env.reset(seed=seed_ep); did = True
            except Exception:
                did = False
        if not did:
            state, info = env_for_rollout.reset()
        teacher_bot.reset(state, team=0)
        for step in range(min(getattr(env_for_rollout.base_env.realm, "step_limit", 240), steps_cap)):
            t_actions = teacher_bot.get_actions(state, team=0)
            _, feats_batch, y_batch = agent_teacher.train_step_bc_multi(info, env_for_rollout.base_env.realm.world.map, t_actions)
            agent_teacher.add_replay(feats_batch, y_batch)
            state, done, info = env_for_rollout.step(t_actions)
            if done: break
        if len(agent_teacher.rb_feats) >= min_items:
            break
    print(f"[warmup] teacher replay filled: {len(agent_teacher.rb_feats)}")

agent_lite = NetAgentSharedLite(fbuild, team_size=TEAM_SIZE)

# Reuse the Stage 2 PvP environment for the teacher warm-up and the final
# DAgger pass (the former try/except NameError assigned the same name in
# both branches, so it was a no-op).
_env_for_warmup = envw_pvp
_ensure_teacher_replay(agent, _env_for_warmup, min_items=800)

# Lazily build the lite model from one replay feature if it does not exist.
if agent_lite.model is None:
    assert len(agent.rb_feats) > 0, "Пустой реплей учителя — нечем инициализировать lite"
    agent_lite._ensure_model(agent.rb_feats[0][None, :])

# Put the student on the teacher's device and set a conservative LR.
device_t = next(agent.model.parameters()).device
agent_lite.model.to(device_t)
if getattr(agent_lite, "optimizer", None) is None:
    agent_lite.optimizer = torch.optim.Adam(agent_lite.model.parameters(), lr=LR)
for pg in agent_lite.optimizer.param_groups:
    # Halve the LR but never go below 1e-4.
    pg["lr"] = max(pg["lr"] * 0.5, 1e-4)

# Distillation. The original call passed only alpha_kd, although
# distill_to_light reads both agent_teacher and agent_student; they are
# passed by keyword here.
# NOTE(review): steps/batch/temperature are left at the function's
# defaults — confirm they fit the time budget.
distill_to_light(
    agent_teacher=agent,
    agent_student=agent_lite,
    alpha_kd=0.85
)

# Short PvP DAgger, now with the lite agent.
train_dagger_pvp(
    agent_lite, _env_for_warmup,
    episodes=200,
    beta_start=0.50,
    beta_end=0.05,
    render_every=10,
    seed_base=888000
)

# Save the lite checkpoint.
lite_ckpt = _export_agent_pkl(agent_lite, path=os.path.join(LOG_DIR, "checkpoints", "agent_lite.pkl"))
shutil.copyfile(lite_ckpt, "agent_lite.pkl")
print("Lite checkpoint:", lite_ckpt)
print("Сохранено также в ./agent_lite.pkl")

[STAGE 1] SOLO DAgger...


DAgger-5:   0%|          | 0/400 [00:00<?, ?it/s]

[DAgger-5] ep=000 finished at step=095, beta=0.943, caught=080, first_cap=2, bc=1.5539, cluster=0.00, idle=0.00, flip=0.34, same=0.04
[DAgger-5] ep=001 finished at step=099, beta=0.936, caught=080, first_cap=1, bc=1.4282, cluster=0.00, idle=0.00, flip=0.30, same=0.14
[DAgger-5] ep=002 finished at step=091, beta=0.929, caught=080, first_cap=1, bc=1.4094, cluster=0.00, idle=0.00, flip=0.38, same=0.01
[DAgger-5] ep=003 finished at step=085, beta=0.922, caught=080, first_cap=1, bc=1.4011, cluster=0.00, idle=0.00, flip=0.24, same=0.09
[DAgger-5] ep=004 finished at step=108, beta=0.916, caught=080, first_cap=0, bc=1.3815, cluster=0.00, idle=0.00, flip=0.26, same=0.14
[DAgger-5] ep=005 finished at step=086, beta=0.909, caught=080, first_cap=0, bc=1.3831, cluster=0.00, idle=0.00, flip=0.36, same=0.06
[DAgger-5] ep=006 finished at step=129, beta=0.902, caught=080, first_cap=1, bc=1.3758, cluster=0.00, idle=0.00, flip=0.27, same=0.38
[DAgger-5] ep=007 finished at step=092, beta=0.896, caught=080

DAgger-PvP:   0%|          | 0/300 [00:00<?, ?it/s]

GIF сохранён: logs\frames\pvp_dagger_ep_000.gif
Командная карта посещений сохранена: logs\maps\pvp_team_ep000.png


[DAgger-PvP] ep=000 step=054 beta=0.595 caught=080 first_cap=2 bc=0.9730 cluster=0.04 idle=0.00 flip=0.35 same=0.22 score_us-bot=43.2-44.0 result=LOSS
[DAgger-PvP] ep=001 step=066 beta=0.590 caught=080 first_cap=0 bc=1.0468 cluster=0.01 idle=0.01 flip=0.30 same=0.18 score_us-bot=51.0-35.0 result=WIN
[DAgger-PvP] ep=002 step=071 beta=0.585 caught=080 first_cap=0 bc=1.0227 cluster=0.00 idle=0.00 flip=0.28 same=0.03 score_us-bot=56.0-34.8 result=WIN
[DAgger-PvP] ep=003 step=054 beta=0.580 caught=080 first_cap=1 bc=1.0554 cluster=0.00 idle=0.00 flip=0.38 same=0.05 score_us-bot=44.5-44.0 result=WIN
[DAgger-PvP] ep=004 step=061 beta=0.576 caught=080 first_cap=1 bc=1.0181 cluster=0.00 idle=0.00 flip=0.29 same=0.13 score_us-bot=40.0-59.0 result=LOSS
[DAgger-PvP] ep=005 step=066 beta=0.571 caught=080 first_cap=2 bc=0.9904 cluster=0.01 idle=0.01 flip=0.28 same=0.25 score_us-bot=45.8-53.2 result=LOSS
[DAgger-PvP] ep=006 step=064 beta=0.566 caught=080 first_cap=0 bc=1.0324 cluster=0.00 idle=0.00 f

[DAgger-PvP] ep=009 step=060 beta=0.552 caught=080 first_cap=0 bc=0.9632 cluster=0.02 idle=0.00 flip=0.33 same=0.03 score_us-bot=48.0-35.0 result=WIN
[DAgger-PvP] ep=010 step=057 beta=0.548 caught=080 first_cap=1 bc=0.9091 cluster=0.00 idle=0.00 flip=0.21 same=0.10 score_us-bot=41.0-42.8 result=LOSS
[DAgger-PvP] ep=011 step=087 beta=0.543 caught=080 first_cap=0 bc=0.8896 cluster=0.02 idle=0.00 flip=0.45 same=0.16 score_us-bot=53.0-46.5 result=WIN
[DAgger-PvP] ep=012 step=059 beta=0.539 caught=080 first_cap=0 bc=0.9252 cluster=0.00 idle=0.00 flip=0.23 same=0.32 score_us-bot=34.0-58.0 result=LOSS
[DAgger-PvP] ep=013 step=060 beta=0.534 caught=080 first_cap=2 bc=0.9967 cluster=0.00 idle=0.00 flip=0.28 same=0.05 score_us-bot=52.5-46.0 result=WIN
[DAgger-PvP] ep=014 step=053 beta=0.530 caught=080 first_cap=0 bc=0.9874 cluster=0.02 idle=0.00 flip=0.35 same=0.33 score_us-bot=46.0-40.0 result=WIN
[DAgger-PvP] ep=015 step=073 beta=0.525 caught=080 first_cap=1 bc=1.0690 cluster=0.00 idle=0.00 fl

[DAgger-PvP] ep=019 step=052 beta=0.508 caught=080 first_cap=0 bc=0.9283 cluster=0.02 idle=0.00 flip=0.45 same=0.34 score_us-bot=45.8-44.5 result=WIN
[DAgger-PvP] ep=020 step=068 beta=0.504 caught=080 first_cap=1 bc=0.9799 cluster=0.00 idle=0.00 flip=0.30 same=0.20 score_us-bot=52.5-46.5 result=WIN
[DAgger-PvP] ep=021 step=049 beta=0.500 caught=080 first_cap=0 bc=0.9552 cluster=0.02 idle=0.02 flip=0.28 same=0.10 score_us-bot=41.0-45.0 result=LOSS
[DAgger-PvP] ep=022 step=056 beta=0.496 caught=080 first_cap=0 bc=0.9376 cluster=0.09 idle=0.00 flip=0.30 same=0.46 score_us-bot=47.0-44.0 result=WIN
[DAgger-PvP] ep=023 step=073 beta=0.492 caught=080 first_cap=0 bc=0.9401 cluster=0.05 idle=0.00 flip=0.23 same=0.84 score_us-bot=54.8-40.2 result=WIN
[DAgger-PvP] ep=024 step=062 beta=0.487 caught=100 first_cap=1 bc=0.9722 cluster=0.00 idle=0.00 flip=0.17 same=0.11 score_us-bot=64.5-64.0 result=WIN
[DAgger-PvP] ep=025 step=068 beta=0.483 caught=100 first_cap=0 bc=1.0828 cluster=0.00 idle=0.00 fli

[DAgger-PvP] ep=029 step=056 beta=0.468 caught=100 first_cap=0 bc=0.9968 cluster=0.00 idle=0.00 flip=0.51 same=0.32 score_us-bot=53.0-53.0 result=DRAW
[DAgger-PvP] ep=030 step=066 beta=0.464 caught=100 first_cap=0 bc=1.0230 cluster=0.07 idle=0.01 flip=0.28 same=0.16 score_us-bot=54.2-67.0 result=LOSS
[DAgger-PvP] ep=031 step=070 beta=0.460 caught=100 first_cap=0 bc=0.9636 cluster=0.00 idle=0.00 flip=0.39 same=0.14 score_us-bot=54.2-54.0 result=WIN
[DAgger-PvP] ep=032 step=071 beta=0.456 caught=100 first_cap=0 bc=1.0037 cluster=0.03 idle=0.01 flip=0.29 same=0.22 score_us-bot=61.0-62.8 result=LOSS
[DAgger-PvP] ep=033 step=073 beta=0.452 caught=100 first_cap=0 bc=0.9017 cluster=0.03 idle=0.00 flip=0.27 same=0.23 score_us-bot=58.5-57.0 result=WIN
[DAgger-PvP] ep=034 step=069 beta=0.449 caught=100 first_cap=0 bc=0.9456 cluster=0.04 idle=0.00 flip=0.44 same=0.33 score_us-bot=61.2-64.0 result=LOSS
[DAgger-PvP] ep=035 step=083 beta=0.445 caught=100 first_cap=0 bc=0.8952 cluster=0.00 idle=0.00 

[DAgger-PvP] ep=039 step=071 beta=0.430 caught=100 first_cap=0 bc=1.0889 cluster=0.00 idle=0.00 flip=0.39 same=0.10 score_us-bot=58.8-53.0 result=WIN
[DAgger-PvP] ep=040 step=064 beta=0.427 caught=100 first_cap=0 bc=1.1219 cluster=0.00 idle=0.00 flip=0.48 same=0.06 score_us-bot=53.2-54.0 result=LOSS
[DAgger-PvP] ep=041 step=069 beta=0.423 caught=100 first_cap=1 bc=0.9678 cluster=0.03 idle=0.00 flip=0.29 same=0.07 score_us-bot=51.8-60.0 result=LOSS
[DAgger-PvP] ep=042 step=081 beta=0.420 caught=100 first_cap=0 bc=0.9807 cluster=0.00 idle=0.00 flip=0.35 same=0.10 score_us-bot=77.0-44.8 result=WIN
[DAgger-PvP] ep=043 step=064 beta=0.416 caught=100 first_cap=0 bc=0.9674 cluster=0.00 idle=0.00 flip=0.40 same=0.18 score_us-bot=51.8-61.8 result=LOSS
[DAgger-PvP] ep=044 step=087 beta=0.413 caught=100 first_cap=1 bc=0.9913 cluster=0.00 idle=0.00 flip=0.25 same=0.10 score_us-bot=68.0-48.0 result=WIN
[DAgger-PvP] ep=045 step=067 beta=0.409 caught=100 first_cap=0 bc=0.9391 cluster=0.00 idle=0.00 f

[DAgger-PvP] ep=049 step=054 beta=0.396 caught=080 first_cap=0 bc=0.9806 cluster=0.00 idle=0.00 flip=0.56 same=0.05 score_us-bot=45.5-48.8 result=LOSS
[DAgger-PvP] ep=050 step=051 beta=0.393 caught=080 first_cap=0 bc=0.9762 cluster=0.00 idle=0.00 flip=0.31 same=0.02 score_us-bot=50.8-50.0 result=WIN
[DAgger-PvP] ep=051 step=063 beta=0.389 caught=080 first_cap=0 bc=1.0351 cluster=0.00 idle=0.00 flip=0.28 same=0.08 score_us-bot=42.0-45.5 result=LOSS
[DAgger-PvP] ep=052 step=066 beta=0.386 caught=080 first_cap=0 bc=1.0589 cluster=0.00 idle=0.00 flip=0.42 same=0.19 score_us-bot=38.5-52.0 result=LOSS
[DAgger-PvP] ep=053 step=069 beta=0.383 caught=080 first_cap=1 bc=1.0445 cluster=0.00 idle=0.00 flip=0.37 same=0.41 score_us-bot=43.0-43.8 result=LOSS
[DAgger-PvP] ep=054 step=068 beta=0.380 caught=080 first_cap=0 bc=0.9918 cluster=0.00 idle=0.00 flip=0.30 same=0.00 score_us-bot=44.0-43.0 result=WIN
[DAgger-PvP] ep=055 step=060 beta=0.377 caught=080 first_cap=1 bc=0.9478 cluster=0.05 idle=0.00 

[DAgger-PvP] ep=059 step=072 beta=0.364 caught=080 first_cap=0 bc=0.9395 cluster=0.08 idle=0.00 flip=0.37 same=0.45 score_us-bot=59.2-42.5 result=WIN
[DAgger-PvP] ep=060 step=047 beta=0.361 caught=080 first_cap=0 bc=0.9596 cluster=0.00 idle=0.00 flip=0.46 same=0.04 score_us-bot=35.0-48.0 result=LOSS
[DAgger-PvP] ep=061 step=050 beta=0.358 caught=080 first_cap=0 bc=1.0039 cluster=0.00 idle=0.02 flip=0.31 same=0.04 score_us-bot=37.2-52.0 result=LOSS
[DAgger-PvP] ep=062 step=068 beta=0.355 caught=080 first_cap=0 bc=1.0305 cluster=0.03 idle=0.00 flip=0.30 same=0.16 score_us-bot=47.5-49.0 result=LOSS
[DAgger-PvP] ep=063 step=051 beta=0.352 caught=080 first_cap=0 bc=1.0171 cluster=0.00 idle=0.00 flip=0.29 same=0.02 score_us-bot=35.0-48.0 result=LOSS
[DAgger-PvP] ep=064 step=052 beta=0.350 caught=080 first_cap=0 bc=1.0274 cluster=0.00 idle=0.00 flip=0.36 same=0.04 score_us-bot=33.0-50.0 result=LOSS
[DAgger-PvP] ep=065 step=068 beta=0.347 caught=080 first_cap=0 bc=1.0000 cluster=0.00 idle=0.01

[DAgger-PvP] ep=069 step=061 beta=0.335 caught=080 first_cap=0 bc=0.9840 cluster=0.03 idle=0.00 flip=0.37 same=0.15 score_us-bot=39.2-47.0 result=LOSS
[DAgger-PvP] ep=070 step=080 beta=0.333 caught=080 first_cap=1 bc=0.8762 cluster=0.02 idle=0.00 flip=0.33 same=0.54 score_us-bot=49.0-56.0 result=LOSS
[DAgger-PvP] ep=071 step=079 beta=0.330 caught=080 first_cap=1 bc=0.9799 cluster=0.00 idle=0.00 flip=0.31 same=0.15 score_us-bot=49.0-48.5 result=WIN
[DAgger-PvP] ep=072 step=075 beta=0.327 caught=100 first_cap=1 bc=0.9775 cluster=0.00 idle=0.01 flip=0.42 same=0.01 score_us-bot=61.2-63.0 result=LOSS
[DAgger-PvP] ep=073 step=080 beta=0.324 caught=100 first_cap=1 bc=1.0432 cluster=0.00 idle=0.01 flip=0.35 same=0.16 score_us-bot=60.0-62.2 result=LOSS
[DAgger-PvP] ep=074 step=076 beta=0.322 caught=100 first_cap=1 bc=0.9835 cluster=0.00 idle=0.00 flip=0.32 same=0.12 score_us-bot=66.0-57.0 result=WIN
[DAgger-PvP] ep=075 step=079 beta=0.319 caught=100 first_cap=1 bc=1.0453 cluster=0.03 idle=0.00 

[DAgger-PvP] ep=079 step=054 beta=0.309 caught=100 first_cap=0 bc=0.9822 cluster=0.00 idle=0.00 flip=0.22 same=0.20 score_us-bot=50.8-65.0 result=LOSS
[DAgger-PvP] ep=080 step=066 beta=0.306 caught=100 first_cap=0 bc=1.0078 cluster=0.00 idle=0.00 flip=0.36 same=0.06 score_us-bot=52.0-70.0 result=LOSS
[DAgger-PvP] ep=081 step=065 beta=0.304 caught=100 first_cap=0 bc=0.9428 cluster=0.02 idle=0.00 flip=0.47 same=0.08 score_us-bot=68.0-57.5 result=WIN
[DAgger-PvP] ep=082 step=056 beta=0.301 caught=100 first_cap=0 bc=0.9641 cluster=0.05 idle=0.00 flip=0.35 same=0.04 score_us-bot=51.0-63.8 result=LOSS
[DAgger-PvP] ep=083 step=087 beta=0.299 caught=100 first_cap=2 bc=0.9940 cluster=0.00 idle=0.00 flip=0.41 same=0.14 score_us-bot=48.8-72.0 result=LOSS
[DAgger-PvP] ep=084 step=074 beta=0.296 caught=100 first_cap=0 bc=1.0433 cluster=0.05 idle=0.01 flip=0.32 same=0.28 score_us-bot=48.0-55.0 result=LOSS
[DAgger-PvP] ep=085 step=068 beta=0.294 caught=100 first_cap=0 bc=1.1136 cluster=0.01 idle=0.00

[DAgger-PvP] ep=089 step=075 beta=0.284 caught=100 first_cap=0 bc=1.0179 cluster=0.04 idle=0.00 flip=0.29 same=0.05 score_us-bot=55.5-70.2 result=LOSS
[DAgger-PvP] ep=090 step=072 beta=0.282 caught=100 first_cap=0 bc=1.0131 cluster=0.01 idle=0.00 flip=0.40 same=0.07 score_us-bot=56.5-54.5 result=WIN
[DAgger-PvP] ep=091 step=056 beta=0.279 caught=100 first_cap=0 bc=0.8771 cluster=0.00 idle=0.00 flip=0.47 same=0.00 score_us-bot=51.2-57.0 result=LOSS
[DAgger-PvP] ep=092 step=061 beta=0.277 caught=100 first_cap=2 bc=0.9791 cluster=0.05 idle=0.02 flip=0.45 same=0.19 score_us-bot=52.8-59.0 result=LOSS
[DAgger-PvP] ep=093 step=074 beta=0.275 caught=100 first_cap=1 bc=0.9858 cluster=0.04 idle=0.00 flip=0.37 same=0.16 score_us-bot=56.2-68.0 result=LOSS
[DAgger-PvP] ep=094 step=087 beta=0.272 caught=100 first_cap=1 bc=0.9759 cluster=0.00 idle=0.00 flip=0.38 same=0.31 score_us-bot=62.8-54.0 result=WIN
[DAgger-PvP] ep=095 step=127 beta=0.270 caught=100 first_cap=0 bc=0.9174 cluster=0.04 idle=0.00 

[DAgger-PvP] ep=099 step=059 beta=0.261 caught=080 first_cap=1 bc=1.0175 cluster=0.00 idle=0.02 flip=0.32 same=0.10 score_us-bot=50.0-45.0 result=WIN
[DAgger-PvP] ep=100 step=072 beta=0.259 caught=080 first_cap=2 bc=0.9474 cluster=0.00 idle=0.00 flip=0.32 same=0.03 score_us-bot=50.0-53.8 result=LOSS
[DAgger-PvP] ep=101 step=093 beta=0.257 caught=080 first_cap=0 bc=1.0188 cluster=0.00 idle=0.00 flip=0.35 same=0.11 score_us-bot=52.2-56.0 result=LOSS
[DAgger-PvP] ep=102 step=093 beta=0.255 caught=080 first_cap=2 bc=1.0580 cluster=0.02 idle=0.00 flip=0.45 same=0.06 score_us-bot=42.0-41.0 result=WIN
[DAgger-PvP] ep=103 step=090 beta=0.253 caught=080 first_cap=1 bc=0.9731 cluster=0.00 idle=0.00 flip=0.37 same=0.00 score_us-bot=62.0-52.8 result=WIN
[DAgger-PvP] ep=104 step=082 beta=0.251 caught=080 first_cap=2 bc=0.9304 cluster=0.01 idle=0.00 flip=0.30 same=0.17 score_us-bot=51.0-50.0 result=WIN
[DAgger-PvP] ep=105 step=088 beta=0.249 caught=080 first_cap=4 bc=0.8718 cluster=0.00 idle=0.00 fl

[DAgger-PvP] ep=109 step=064 beta=0.241 caught=080 first_cap=0 bc=1.0869 cluster=0.00 idle=0.00 flip=0.23 same=0.05 score_us-bot=28.0-55.0 result=LOSS
[DAgger-PvP] ep=110 step=086 beta=0.239 caught=080 first_cap=1 bc=1.0276 cluster=0.00 idle=0.01 flip=0.45 same=0.09 score_us-bot=41.0-58.0 result=LOSS
[DAgger-PvP] ep=111 step=079 beta=0.237 caught=080 first_cap=1 bc=0.9502 cluster=0.01 idle=0.01 flip=0.26 same=0.10 score_us-bot=51.0-45.5 result=WIN
[DAgger-PvP] ep=112 step=070 beta=0.235 caught=080 first_cap=0 bc=1.0259 cluster=0.00 idle=0.00 flip=0.35 same=0.00 score_us-bot=46.5-53.0 result=LOSS
[DAgger-PvP] ep=113 step=098 beta=0.233 caught=080 first_cap=0 bc=1.0976 cluster=0.00 idle=0.00 flip=0.37 same=0.19 score_us-bot=51.2-53.2 result=LOSS
[DAgger-PvP] ep=114 step=100 beta=0.231 caught=080 first_cap=0 bc=1.0298 cluster=0.03 idle=0.00 flip=0.25 same=0.22 score_us-bot=57.2-48.8 result=WIN
[DAgger-PvP] ep=115 step=089 beta=0.229 caught=080 first_cap=4 bc=0.9218 cluster=0.01 idle=0.00 

[DAgger-PvP] ep=119 step=103 beta=0.221 caught=080 first_cap=3 bc=0.9412 cluster=0.00 idle=0.00 flip=0.38 same=0.02 score_us-bot=41.2-52.8 result=LOSS
[DAgger-PvP] ep=120 step=105 beta=0.219 caught=100 first_cap=0 bc=0.9560 cluster=0.00 idle=0.00 flip=0.26 same=0.11 score_us-bot=56.0-62.0 result=LOSS
[DAgger-PvP] ep=121 step=086 beta=0.218 caught=100 first_cap=1 bc=0.9485 cluster=0.00 idle=0.00 flip=0.25 same=0.08 score_us-bot=46.2-65.0 result=LOSS
[DAgger-PvP] ep=122 step=079 beta=0.216 caught=100 first_cap=0 bc=1.0728 cluster=0.00 idle=0.00 flip=0.38 same=0.10 score_us-bot=46.8-63.0 result=LOSS
[DAgger-PvP] ep=123 step=085 beta=0.214 caught=100 first_cap=0 bc=1.0084 cluster=0.00 idle=0.01 flip=0.34 same=0.07 score_us-bot=57.2-75.0 result=LOSS
[DAgger-PvP] ep=124 step=079 beta=0.212 caught=100 first_cap=1 bc=1.0340 cluster=0.04 idle=0.00 flip=0.21 same=0.35 score_us-bot=59.8-57.0 result=WIN
[DAgger-PvP] ep=125 step=115 beta=0.211 caught=100 first_cap=0 bc=1.1044 cluster=0.02 idle=0.01

[DAgger-PvP] ep=129 step=082 beta=0.204 caught=100 first_cap=0 bc=0.9085 cluster=0.00 idle=0.00 flip=0.36 same=0.17 score_us-bot=48.2-56.0 result=LOSS
[DAgger-PvP] ep=130 step=125 beta=0.202 caught=100 first_cap=0 bc=0.9517 cluster=0.00 idle=0.01 flip=0.40 same=0.03 score_us-bot=57.0-49.2 result=WIN
[DAgger-PvP] ep=131 step=130 beta=0.200 caught=100 first_cap=2 bc=0.8942 cluster=0.01 idle=0.00 flip=0.28 same=0.25 score_us-bot=59.0-63.0 result=LOSS
[DAgger-PvP] ep=132 step=079 beta=0.199 caught=100 first_cap=0 bc=0.9915 cluster=0.03 idle=0.01 flip=0.23 same=0.12 score_us-bot=50.2-60.0 result=LOSS
[DAgger-PvP] ep=133 step=088 beta=0.197 caught=100 first_cap=1 bc=0.9901 cluster=0.02 idle=0.01 flip=0.29 same=0.07 score_us-bot=56.2-58.5 result=LOSS
[DAgger-PvP] ep=134 step=093 beta=0.195 caught=100 first_cap=1 bc=1.0134 cluster=0.00 idle=0.00 flip=0.32 same=0.05 score_us-bot=52.0-60.0 result=LOSS
[DAgger-PvP] ep=135 step=070 beta=0.194 caught=100 first_cap=0 bc=1.0980 cluster=0.00 idle=0.01

[DAgger-PvP] ep=139 step=100 beta=0.187 caught=100 first_cap=2 bc=0.9458 cluster=0.00 idle=0.00 flip=0.41 same=0.04 score_us-bot=67.0-63.0 result=WIN
[DAgger-PvP] ep=140 step=088 beta=0.186 caught=100 first_cap=1 bc=0.9127 cluster=0.02 idle=0.00 flip=0.33 same=0.15 score_us-bot=54.0-54.0 result=DRAW
[DAgger-PvP] ep=141 step=122 beta=0.184 caught=100 first_cap=1 bc=0.9304 cluster=0.00 idle=0.00 flip=0.31 same=0.09 score_us-bot=64.8-65.8 result=LOSS
[DAgger-PvP] ep=142 step=092 beta=0.183 caught=100 first_cap=2 bc=0.9068 cluster=0.02 idle=0.00 flip=0.43 same=0.26 score_us-bot=60.0-53.0 result=WIN
[DAgger-PvP] ep=143 step=145 beta=0.181 caught=100 first_cap=0 bc=0.9075 cluster=0.01 idle=0.00 flip=0.35 same=0.14 score_us-bot=65.5-58.2 result=WIN
[DAgger-PvP] ep=144 step=078 beta=0.180 caught=080 first_cap=1 bc=0.9664 cluster=0.03 idle=0.01 flip=0.30 same=0.11 score_us-bot=34.0-52.0 result=LOSS
[DAgger-PvP] ep=145 step=073 beta=0.178 caught=080 first_cap=0 bc=1.0409 cluster=0.00 idle=0.01 f

[DAgger-PvP] ep=149 step=095 beta=0.172 caught=080 first_cap=0 bc=1.0697 cluster=0.00 idle=0.00 flip=0.41 same=0.07 score_us-bot=41.8-65.0 result=LOSS
[DAgger-PvP] ep=150 step=086 beta=0.171 caught=080 first_cap=0 bc=0.9995 cluster=0.00 idle=0.00 flip=0.32 same=0.17 score_us-bot=44.2-46.0 result=LOSS
[DAgger-PvP] ep=151 step=076 beta=0.170 caught=080 first_cap=0 bc=0.9382 cluster=0.00 idle=0.00 flip=0.32 same=0.31 score_us-bot=47.8-48.0 result=LOSS
[DAgger-PvP] ep=152 step=093 beta=0.168 caught=080 first_cap=3 bc=0.9664 cluster=0.00 idle=0.01 flip=0.31 same=0.09 score_us-bot=35.8-57.0 result=LOSS
[DAgger-PvP] ep=153 step=084 beta=0.167 caught=080 first_cap=0 bc=0.8921 cluster=0.00 idle=0.01 flip=0.47 same=0.12 score_us-bot=32.0-51.0 result=LOSS
[DAgger-PvP] ep=154 step=100 beta=0.165 caught=080 first_cap=1 bc=0.9565 cluster=0.00 idle=0.00 flip=0.40 same=0.04 score_us-bot=44.0-44.0 result=DRAW
[DAgger-PvP] ep=155 step=115 beta=0.164 caught=080 first_cap=0 bc=0.9952 cluster=0.03 idle=0.0

[DAgger-PvP] ep=159 step=089 beta=0.159 caught=080 first_cap=0 bc=1.0326 cluster=0.00 idle=0.01 flip=0.28 same=0.02 score_us-bot=50.0-33.0 result=WIN
[DAgger-PvP] ep=160 step=100 beta=0.157 caught=080 first_cap=4 bc=0.9776 cluster=0.00 idle=0.01 flip=0.35 same=0.04 score_us-bot=37.2-55.0 result=LOSS
[DAgger-PvP] ep=161 step=112 beta=0.156 caught=080 first_cap=0 bc=1.0424 cluster=0.00 idle=0.00 flip=0.30 same=0.28 score_us-bot=46.0-42.5 result=WIN
[DAgger-PvP] ep=162 step=112 beta=0.155 caught=080 first_cap=0 bc=0.9964 cluster=0.01 idle=0.00 flip=0.27 same=0.23 score_us-bot=54.8-57.2 result=LOSS
[DAgger-PvP] ep=163 step=096 beta=0.154 caught=080 first_cap=3 bc=0.9586 cluster=0.01 idle=0.00 flip=0.38 same=0.13 score_us-bot=35.0-51.0 result=LOSS
[DAgger-PvP] ep=164 step=104 beta=0.152 caught=080 first_cap=1 bc=0.8936 cluster=0.02 idle=0.00 flip=0.31 same=0.37 score_us-bot=40.0-48.0 result=LOSS
[DAgger-PvP] ep=165 step=100 beta=0.151 caught=080 first_cap=2 bc=0.9899 cluster=0.00 idle=0.00 

[DAgger-PvP] ep=169 step=072 beta=0.146 caught=100 first_cap=1 bc=0.9844 cluster=0.00 idle=0.00 flip=0.21 same=0.03 score_us-bot=46.0-60.5 result=LOSS
[DAgger-PvP] ep=170 step=097 beta=0.145 caught=100 first_cap=2 bc=1.0018 cluster=0.00 idle=0.01 flip=0.30 same=0.18 score_us-bot=61.5-57.0 result=WIN
[DAgger-PvP] ep=171 step=085 beta=0.144 caught=100 first_cap=1 bc=1.0004 cluster=0.00 idle=0.01 flip=0.36 same=0.00 score_us-bot=46.0-60.5 result=LOSS
[DAgger-PvP] ep=172 step=106 beta=0.142 caught=100 first_cap=1 bc=1.0637 cluster=0.00 idle=0.01 flip=0.36 same=0.09 score_us-bot=50.5-68.0 result=LOSS
[DAgger-PvP] ep=173 step=071 beta=0.141 caught=100 first_cap=1 bc=1.0497 cluster=0.00 idle=0.01 flip=0.28 same=0.06 score_us-bot=47.0-53.0 result=LOSS
[DAgger-PvP] ep=174 step=104 beta=0.140 caught=100 first_cap=0 bc=0.9921 cluster=0.00 idle=0.01 flip=0.45 same=0.02 score_us-bot=45.8-72.0 result=LOSS
[DAgger-PvP] ep=175 step=103 beta=0.139 caught=100 first_cap=0 bc=0.9963 cluster=0.01 idle=0.00

[DAgger-PvP] ep=179 step=093 beta=0.134 caught=100 first_cap=1 bc=0.8641 cluster=0.00 idle=0.00 flip=0.45 same=0.10 score_us-bot=41.0-65.0 result=LOSS
[DAgger-PvP] ep=180 step=091 beta=0.133 caught=100 first_cap=1 bc=0.9622 cluster=0.00 idle=0.00 flip=0.23 same=0.16 score_us-bot=51.2-62.0 result=LOSS
[DAgger-PvP] ep=181 step=098 beta=0.132 caught=100 first_cap=0 bc=1.0237 cluster=0.01 idle=0.01 flip=0.26 same=0.02 score_us-bot=61.0-75.0 result=LOSS
[DAgger-PvP] ep=182 step=079 beta=0.131 caught=100 first_cap=0 bc=1.0952 cluster=0.00 idle=0.00 flip=0.20 same=0.05 score_us-bot=40.0-69.0 result=LOSS
[DAgger-PvP] ep=183 step=097 beta=0.130 caught=100 first_cap=1 bc=1.0664 cluster=0.02 idle=0.01 flip=0.32 same=0.11 score_us-bot=61.8-61.2 result=WIN
[DAgger-PvP] ep=184 step=080 beta=0.129 caught=100 first_cap=0 bc=0.9683 cluster=0.00 idle=0.00 flip=0.41 same=0.09 score_us-bot=60.8-55.0 result=WIN
[DAgger-PvP] ep=185 step=100 beta=0.128 caught=100 first_cap=1 bc=1.0061 cluster=0.00 idle=0.00 

[DAgger-PvP] ep=189 step=105 beta=0.124 caught=100 first_cap=0 bc=0.9176 cluster=0.00 idle=0.00 flip=0.46 same=0.04 score_us-bot=56.5-62.0 result=LOSS
[DAgger-PvP] ep=190 step=108 beta=0.123 caught=100 first_cap=0 bc=1.0328 cluster=0.06 idle=0.01 flip=0.36 same=0.34 score_us-bot=48.8-76.0 result=LOSS
[DAgger-PvP] ep=191 step=107 beta=0.122 caught=100 first_cap=0 bc=1.0483 cluster=0.00 idle=0.00 flip=0.42 same=0.00 score_us-bot=39.0-67.0 result=LOSS
[DAgger-PvP] ep=192 step=112 beta=0.121 caught=080 first_cap=2 bc=0.9699 cluster=0.00 idle=0.01 flip=0.27 same=0.08 score_us-bot=39.0-47.0 result=LOSS
[DAgger-PvP] ep=193 step=099 beta=0.120 caught=080 first_cap=4 bc=1.0350 cluster=0.00 idle=0.01 flip=0.29 same=0.06 score_us-bot=43.8-46.0 result=LOSS
[DAgger-PvP] ep=194 step=111 beta=0.119 caught=080 first_cap=0 bc=1.0214 cluster=0.01 idle=0.01 flip=0.27 same=0.03 score_us-bot=48.8-50.0 result=LOSS
[DAgger-PvP] ep=195 step=101 beta=0.118 caught=080 first_cap=2 bc=1.0243 cluster=0.00 idle=0.0

[DAgger-PvP] ep=199 step=125 beta=0.114 caught=080 first_cap=2 bc=0.9834 cluster=0.00 idle=0.00 flip=0.33 same=0.24 score_us-bot=50.2-49.5 result=WIN
[DAgger-PvP] ep=200 step=133 beta=0.113 caught=080 first_cap=0 bc=0.9782 cluster=0.01 idle=0.00 flip=0.24 same=0.10 score_us-bot=43.2-44.0 result=LOSS
[DAgger-PvP] ep=201 step=150 beta=0.112 caught=080 first_cap=0 bc=0.8867 cluster=0.01 idle=0.01 flip=0.43 same=0.17 score_us-bot=50.0-45.2 result=WIN
[DAgger-PvP] ep=202 step=154 beta=0.111 caught=080 first_cap=2 bc=0.9809 cluster=0.00 idle=0.01 flip=0.45 same=0.07 score_us-bot=43.0-63.0 result=LOSS
[DAgger-PvP] ep=203 step=110 beta=0.110 caught=080 first_cap=0 bc=0.9636 cluster=0.00 idle=0.00 flip=0.36 same=0.26 score_us-bot=36.8-48.0 result=LOSS
[DAgger-PvP] ep=204 step=086 beta=0.109 caught=080 first_cap=0 bc=0.9846 cluster=0.00 idle=0.01 flip=0.29 same=0.07 score_us-bot=42.8-50.0 result=LOSS
[DAgger-PvP] ep=205 step=105 beta=0.108 caught=080 first_cap=3 bc=0.9872 cluster=0.00 idle=0.01 

[DAgger-PvP] ep=209 step=116 beta=0.105 caught=080 first_cap=0 bc=1.0518 cluster=0.00 idle=0.00 flip=0.28 same=0.01 score_us-bot=45.8-52.0 result=LOSS
[DAgger-PvP] ep=210 step=120 beta=0.104 caught=080 first_cap=2 bc=1.0557 cluster=0.00 idle=0.00 flip=0.50 same=0.04 score_us-bot=51.2-52.8 result=LOSS
[DAgger-PvP] ep=211 step=114 beta=0.103 caught=080 first_cap=1 bc=0.9598 cluster=0.00 idle=0.00 flip=0.37 same=0.02 score_us-bot=24.0-59.0 result=LOSS
[DAgger-PvP] ep=212 step=107 beta=0.102 caught=080 first_cap=0 bc=0.9260 cluster=0.04 idle=0.00 flip=0.30 same=0.21 score_us-bot=33.0-53.0 result=LOSS
[DAgger-PvP] ep=213 step=146 beta=0.101 caught=080 first_cap=0 bc=0.8947 cluster=0.10 idle=0.00 flip=0.36 same=0.10 score_us-bot=39.2-66.0 result=LOSS
[DAgger-PvP] ep=214 step=117 beta=0.100 caught=080 first_cap=0 bc=0.9295 cluster=0.01 idle=0.00 flip=0.36 same=0.09 score_us-bot=36.0-47.0 result=LOSS
[DAgger-PvP] ep=215 step=177 beta=0.100 caught=080 first_cap=1 bc=0.9584 cluster=0.06 idle=0.0

[DAgger-PvP] ep=219 step=112 beta=0.096 caught=100 first_cap=1 bc=1.0087 cluster=0.00 idle=0.01 flip=0.31 same=0.00 score_us-bot=59.2-58.0 result=WIN
[DAgger-PvP] ep=220 step=126 beta=0.096 caught=100 first_cap=2 bc=1.0222 cluster=0.01 idle=0.01 flip=0.46 same=0.09 score_us-bot=74.0-48.0 result=WIN
[DAgger-PvP] ep=221 step=154 beta=0.095 caught=100 first_cap=0 bc=1.0273 cluster=0.00 idle=0.01 flip=0.48 same=0.03 score_us-bot=62.0-52.0 result=WIN
[DAgger-PvP] ep=222 step=131 beta=0.094 caught=100 first_cap=3 bc=0.9886 cluster=0.00 idle=0.00 flip=0.33 same=0.04 score_us-bot=59.2-60.0 result=LOSS
[DAgger-PvP] ep=223 step=108 beta=0.093 caught=100 first_cap=2 bc=1.0079 cluster=0.00 idle=0.00 flip=0.38 same=0.12 score_us-bot=39.0-67.0 result=LOSS
[DAgger-PvP] ep=224 step=160 beta=0.092 caught=100 first_cap=1 bc=0.9477 cluster=0.01 idle=0.00 flip=0.37 same=0.37 score_us-bot=54.0-54.5 result=LOSS
[DAgger-PvP] ep=225 step=107 beta=0.092 caught=100 first_cap=2 bc=0.9553 cluster=0.00 idle=0.00 f

[DAgger-PvP] ep=229 step=108 beta=0.089 caught=100 first_cap=3 bc=0.9720 cluster=0.00 idle=0.01 flip=0.35 same=0.13 score_us-bot=50.5-60.0 result=LOSS
[DAgger-PvP] ep=230 step=109 beta=0.088 caught=100 first_cap=2 bc=1.0678 cluster=0.00 idle=0.01 flip=0.28 same=0.00 score_us-bot=41.0-59.0 result=LOSS
[DAgger-PvP] ep=231 step=089 beta=0.087 caught=100 first_cap=0 bc=0.9369 cluster=0.00 idle=0.01 flip=0.26 same=0.06 score_us-bot=41.0-65.0 result=LOSS
[DAgger-PvP] ep=232 step=127 beta=0.087 caught=100 first_cap=1 bc=1.0562 cluster=0.00 idle=0.01 flip=0.30 same=0.18 score_us-bot=68.0-69.0 result=LOSS
[DAgger-PvP] ep=233 step=148 beta=0.086 caught=100 first_cap=1 bc=1.0725 cluster=0.03 idle=0.00 flip=0.36 same=0.17 score_us-bot=66.2-63.0 result=WIN
[DAgger-PvP] ep=234 step=157 beta=0.085 caught=100 first_cap=1 bc=1.0371 cluster=0.00 idle=0.01 flip=0.53 same=0.00 score_us-bot=37.0-69.0 result=LOSS
[DAgger-PvP] ep=235 step=145 beta=0.084 caught=100 first_cap=0 bc=0.9916 cluster=0.06 idle=0.01

[DAgger-PvP] ep=239 step=156 beta=0.082 caught=100 first_cap=0 bc=0.9315 cluster=0.01 idle=0.00 flip=0.52 same=0.15 score_us-bot=43.5-75.5 result=LOSS
[DAgger-PvP] ep=240 step=085 beta=0.081 caught=080 first_cap=0 bc=0.9888 cluster=0.03 idle=0.00 flip=0.28 same=0.26 score_us-bot=50.5-48.2 result=WIN
[DAgger-PvP] ep=241 step=071 beta=0.080 caught=080 first_cap=0 bc=1.0983 cluster=0.03 idle=0.01 flip=0.29 same=0.33 score_us-bot=43.0-46.5 result=LOSS
[DAgger-PvP] ep=242 step=056 beta=0.080 caught=080 first_cap=1 bc=1.0294 cluster=0.00 idle=0.00 flip=0.21 same=0.00 score_us-bot=52.2-61.0 result=LOSS
[DAgger-PvP] ep=243 step=068 beta=0.079 caught=080 first_cap=1 bc=1.1097 cluster=0.04 idle=0.01 flip=0.38 same=0.28 score_us-bot=30.0-65.0 result=LOSS
[DAgger-PvP] ep=244 step=074 beta=0.078 caught=080 first_cap=0 bc=1.0361 cluster=0.04 idle=0.01 flip=0.36 same=0.04 score_us-bot=37.5-51.0 result=LOSS
[DAgger-PvP] ep=245 step=073 beta=0.078 caught=080 first_cap=3 bc=1.0616 cluster=0.03 idle=0.01

[DAgger-PvP] ep=249 step=075 beta=0.075 caught=080 first_cap=2 bc=0.9709 cluster=0.03 idle=0.01 flip=0.34 same=0.03 score_us-bot=47.2-68.0 result=LOSS
[DAgger-PvP] ep=250 step=071 beta=0.075 caught=080 first_cap=0 bc=0.9729 cluster=0.04 idle=0.01 flip=0.43 same=0.24 score_us-bot=36.8-56.0 result=LOSS
[DAgger-PvP] ep=251 step=074 beta=0.074 caught=080 first_cap=1 bc=0.9699 cluster=0.03 idle=0.01 flip=0.28 same=0.44 score_us-bot=40.0-46.0 result=LOSS
[DAgger-PvP] ep=252 step=072 beta=0.073 caught=080 first_cap=0 bc=1.0092 cluster=0.00 idle=0.01 flip=0.34 same=0.27 score_us-bot=46.5-44.0 result=WIN
[DAgger-PvP] ep=253 step=071 beta=0.073 caught=080 first_cap=0 bc=1.0639 cluster=0.00 idle=0.01 flip=0.43 same=0.07 score_us-bot=45.8-51.0 result=LOSS
[DAgger-PvP] ep=254 step=066 beta=0.072 caught=080 first_cap=0 bc=1.0096 cluster=0.00 idle=0.00 flip=0.22 same=0.00 score_us-bot=33.0-53.0 result=LOSS
[DAgger-PvP] ep=255 step=071 beta=0.071 caught=080 first_cap=0 bc=1.0301 cluster=0.00 idle=0.01

[DAgger-PvP] ep=259 step=068 beta=0.069 caught=080 first_cap=0 bc=0.9810 cluster=0.00 idle=0.00 flip=0.38 same=0.09 score_us-bot=42.5-49.0 result=LOSS
[DAgger-PvP] ep=260 step=062 beta=0.069 caught=080 first_cap=1 bc=0.9710 cluster=0.10 idle=0.02 flip=0.27 same=0.25 score_us-bot=49.8-43.0 result=WIN
[DAgger-PvP] ep=261 step=066 beta=0.068 caught=080 first_cap=1 bc=1.0130 cluster=0.00 idle=0.01 flip=0.49 same=0.04 score_us-bot=28.0-58.0 result=LOSS
[DAgger-PvP] ep=262 step=074 beta=0.067 caught=080 first_cap=0 bc=0.8929 cluster=0.03 idle=0.01 flip=0.40 same=0.05 score_us-bot=35.0-61.2 result=LOSS
[DAgger-PvP] ep=263 step=090 beta=0.067 caught=080 first_cap=1 bc=1.0353 cluster=0.03 idle=0.01 flip=0.49 same=0.13 score_us-bot=38.0-42.0 result=LOSS
[DAgger-PvP] ep=264 step=081 beta=0.066 caught=100 first_cap=1 bc=1.1196 cluster=0.00 idle=0.01 flip=0.18 same=0.01 score_us-bot=56.5-63.0 result=LOSS
[DAgger-PvP] ep=265 step=068 beta=0.066 caught=100 first_cap=1 bc=0.9589 cluster=0.00 idle=0.01

[DAgger-PvP] ep=269 step=079 beta=0.064 caught=100 first_cap=2 bc=0.9887 cluster=0.03 idle=0.01 flip=0.34 same=0.14 score_us-bot=51.5-59.8 result=LOSS
[DAgger-PvP] ep=270 step=062 beta=0.063 caught=100 first_cap=1 bc=1.0608 cluster=0.00 idle=0.02 flip=0.38 same=0.03 score_us-bot=55.8-54.0 result=WIN
[DAgger-PvP] ep=271 step=074 beta=0.063 caught=100 first_cap=0 bc=0.9830 cluster=0.03 idle=0.01 flip=0.19 same=0.05 score_us-bot=56.8-60.0 result=LOSS
[DAgger-PvP] ep=272 step=091 beta=0.062 caught=100 first_cap=0 bc=1.0031 cluster=0.08 idle=0.00 flip=0.39 same=0.25 score_us-bot=61.8-59.0 result=WIN
[DAgger-PvP] ep=273 step=083 beta=0.062 caught=100 first_cap=0 bc=0.9643 cluster=0.01 idle=0.01 flip=0.42 same=0.23 score_us-bot=63.5-71.0 result=LOSS
[DAgger-PvP] ep=274 step=095 beta=0.061 caught=100 first_cap=1 bc=0.9114 cluster=0.03 idle=0.01 flip=0.41 same=0.26 score_us-bot=60.2-68.0 result=LOSS
[DAgger-PvP] ep=275 step=093 beta=0.061 caught=100 first_cap=1 bc=0.8278 cluster=0.10 idle=0.00 

[DAgger-PvP] ep=279 step=079 beta=0.059 caught=100 first_cap=1 bc=1.0904 cluster=0.00 idle=0.01 flip=0.36 same=0.04 score_us-bot=58.0-60.2 result=LOSS
[DAgger-PvP] ep=280 step=074 beta=0.058 caught=100 first_cap=0 bc=1.0294 cluster=0.00 idle=0.00 flip=0.31 same=0.16 score_us-bot=56.5-65.0 result=LOSS
[DAgger-PvP] ep=281 step=063 beta=0.058 caught=100 first_cap=0 bc=0.9214 cluster=0.05 idle=0.00 flip=0.36 same=0.12 score_us-bot=60.0-59.5 result=WIN
[DAgger-PvP] ep=282 step=090 beta=0.057 caught=100 first_cap=0 bc=1.0509 cluster=0.01 idle=0.01 flip=0.48 same=0.12 score_us-bot=69.0-59.0 result=WIN
[DAgger-PvP] ep=283 step=072 beta=0.057 caught=100 first_cap=1 bc=0.9911 cluster=0.00 idle=0.00 flip=0.48 same=0.10 score_us-bot=52.0-64.2 result=LOSS
[DAgger-PvP] ep=284 step=080 beta=0.056 caught=100 first_cap=0 bc=0.9108 cluster=0.00 idle=0.01 flip=0.37 same=0.09 score_us-bot=52.5-66.0 result=LOSS
[DAgger-PvP] ep=285 step=067 beta=0.056 caught=100 first_cap=0 bc=1.0185 cluster=0.04 idle=0.01 

[DAgger-PvP] ep=289 step=070 beta=0.054 caught=080 first_cap=0 bc=1.0816 cluster=0.00 idle=0.01 flip=0.34 same=0.03 score_us-bot=50.5-50.0 result=WIN
[DAgger-PvP] ep=290 step=078 beta=0.053 caught=080 first_cap=0 bc=1.0558 cluster=0.01 idle=0.01 flip=0.24 same=0.15 score_us-bot=50.2-49.0 result=WIN
[DAgger-PvP] ep=291 step=073 beta=0.053 caught=080 first_cap=1 bc=1.0094 cluster=0.03 idle=0.01 flip=0.28 same=0.26 score_us-bot=52.5-48.0 result=WIN
[DAgger-PvP] ep=292 step=078 beta=0.053 caught=080 first_cap=0 bc=1.0507 cluster=0.01 idle=0.01 flip=0.27 same=0.29 score_us-bot=49.8-45.5 result=WIN
[DAgger-PvP] ep=293 step=055 beta=0.052 caught=080 first_cap=1 bc=1.0660 cluster=0.02 idle=0.02 flip=0.41 same=0.14 score_us-bot=41.5-52.0 result=LOSS
[DAgger-PvP] ep=294 step=075 beta=0.052 caught=080 first_cap=0 bc=1.0331 cluster=0.11 idle=0.03 flip=0.33 same=0.29 score_us-bot=42.0-55.0 result=LOSS
[DAgger-PvP] ep=295 step=079 beta=0.051 caught=080 first_cap=0 bc=0.9260 cluster=0.15 idle=0.00 fl

[DAgger-PvP] ep=299 step=073 beta=0.050 caught=080 first_cap=0 bc=0.8425 cluster=0.00 idle=0.01 flip=0.41 same=0.07 score_us-bot=40.0-48.0 result=LOSS
Final PvP checkpoint: C:\Users\aveexela\Desktop\rl_project\logs\checkpoints\agent_pvp.pkl
Сохранено также в ./agent.pkl
[STAGE 3] Distill → Lite model...


DAgger-PvP:   0%|          | 0/200 [00:00<?, ?it/s]

GIF сохранён: logs\frames\pvp_dagger_ep_000.gif
Командная карта посещений сохранена: logs\maps\pvp_team_ep000.png


[DAgger-PvP] ep=000 step=064 beta=0.494 caught=080 first_cap=0 bc=1.3111 cluster=0.00 idle=0.00 flip=0.46 same=0.09 score_us-bot=44.0-47.8 result=LOSS
[DAgger-PvP] ep=001 step=054 beta=0.489 caught=080 first_cap=0 bc=1.2954 cluster=0.00 idle=0.00 flip=0.35 same=0.04 score_us-bot=44.0-59.2 result=LOSS
[DAgger-PvP] ep=002 step=064 beta=0.483 caught=080 first_cap=1 bc=1.2339 cluster=0.00 idle=0.00 flip=0.34 same=0.08 score_us-bot=47.8-52.0 result=LOSS
[DAgger-PvP] ep=003 step=056 beta=0.477 caught=080 first_cap=0 bc=1.2089 cluster=0.02 idle=0.00 flip=0.40 same=0.26 score_us-bot=45.0-42.0 result=WIN
[DAgger-PvP] ep=004 step=074 beta=0.472 caught=080 first_cap=0 bc=1.2650 cluster=0.07 idle=0.01 flip=0.28 same=0.23 score_us-bot=42.0-60.0 result=LOSS
[DAgger-PvP] ep=005 step=075 beta=0.466 caught=080 first_cap=0 bc=1.2228 cluster=0.00 idle=0.01 flip=0.34 same=0.09 score_us-bot=39.0-50.2 result=LOSS
[DAgger-PvP] ep=006 step=057 beta=0.461 caught=080 first_cap=0 bc=1.2274 cluster=0.02 idle=0.02

[DAgger-PvP] ep=009 step=056 beta=0.445 caught=080 first_cap=2 bc=1.0427 cluster=0.00 idle=0.00 flip=0.35 same=0.40 score_us-bot=59.8-45.8 result=WIN
[DAgger-PvP] ep=010 step=065 beta=0.440 caught=080 first_cap=1 bc=1.1314 cluster=0.17 idle=0.02 flip=0.33 same=0.14 score_us-bot=47.8-45.0 result=WIN
[DAgger-PvP] ep=011 step=062 beta=0.435 caught=080 first_cap=0 bc=1.2123 cluster=0.00 idle=0.00 flip=0.40 same=0.13 score_us-bot=43.8-44.0 result=LOSS
[DAgger-PvP] ep=012 step=074 beta=0.430 caught=100 first_cap=1 bc=1.2935 cluster=0.00 idle=0.00 flip=0.33 same=0.04 score_us-bot=65.5-57.0 result=WIN
[DAgger-PvP] ep=013 step=076 beta=0.425 caught=100 first_cap=0 bc=1.3073 cluster=0.01 idle=0.00 flip=0.27 same=0.13 score_us-bot=54.2-63.2 result=LOSS
[DAgger-PvP] ep=014 step=059 beta=0.420 caught=100 first_cap=0 bc=1.2951 cluster=0.00 idle=0.00 flip=0.42 same=0.00 score_us-bot=58.0-54.8 result=WIN
[DAgger-PvP] ep=015 step=065 beta=0.415 caught=100 first_cap=0 bc=1.1963 cluster=0.02 idle=0.00 fl

[DAgger-PvP] ep=019 step=081 beta=0.397 caught=100 first_cap=1 bc=1.1047 cluster=0.02 idle=0.01 flip=0.38 same=0.28 score_us-bot=57.8-66.0 result=LOSS
[DAgger-PvP] ep=020 step=082 beta=0.392 caught=100 first_cap=0 bc=1.1423 cluster=0.02 idle=0.01 flip=0.27 same=0.27 score_us-bot=43.0-66.0 result=LOSS
[DAgger-PvP] ep=021 step=067 beta=0.388 caught=100 first_cap=0 bc=1.1165 cluster=0.00 idle=0.00 flip=0.28 same=0.12 score_us-bot=57.8-62.0 result=LOSS
[DAgger-PvP] ep=022 step=065 beta=0.383 caught=100 first_cap=0 bc=1.1476 cluster=0.03 idle=0.02 flip=0.27 same=0.24 score_us-bot=52.0-56.0 result=LOSS
[DAgger-PvP] ep=023 step=095 beta=0.379 caught=100 first_cap=2 bc=1.0852 cluster=0.02 idle=0.00 flip=0.32 same=0.08 score_us-bot=63.0-63.5 result=LOSS
[DAgger-PvP] ep=024 step=077 beta=0.374 caught=100 first_cap=0 bc=1.2170 cluster=0.00 idle=0.01 flip=0.29 same=0.05 score_us-bot=61.0-62.8 result=LOSS
[DAgger-PvP] ep=025 step=071 beta=0.370 caught=100 first_cap=1 bc=1.2561 cluster=0.00 idle=0.0

[DAgger-PvP] ep=029 step=074 beta=0.353 caught=100 first_cap=0 bc=1.2100 cluster=0.00 idle=0.01 flip=0.17 same=0.29 score_us-bot=55.5-71.0 result=LOSS
[DAgger-PvP] ep=030 step=077 beta=0.349 caught=100 first_cap=0 bc=1.1720 cluster=0.00 idle=0.00 flip=0.50 same=0.09 score_us-bot=63.8-65.0 result=LOSS
[DAgger-PvP] ep=031 step=074 beta=0.345 caught=100 first_cap=0 bc=1.1359 cluster=0.04 idle=0.01 flip=0.39 same=0.15 score_us-bot=56.0-63.0 result=LOSS
[DAgger-PvP] ep=032 step=079 beta=0.341 caught=100 first_cap=0 bc=1.1228 cluster=0.05 idle=0.01 flip=0.26 same=0.26 score_us-bot=49.0-63.8 result=LOSS
[DAgger-PvP] ep=033 step=076 beta=0.337 caught=100 first_cap=0 bc=1.1435 cluster=0.00 idle=0.00 flip=0.42 same=0.22 score_us-bot=63.0-67.8 result=LOSS
[DAgger-PvP] ep=034 step=092 beta=0.333 caught=100 first_cap=0 bc=1.1342 cluster=0.01 idle=0.00 flip=0.27 same=0.30 score_us-bot=75.5-59.2 result=WIN
[DAgger-PvP] ep=035 step=086 beta=0.330 caught=100 first_cap=0 bc=1.1505 cluster=0.07 idle=0.01

[DAgger-PvP] ep=039 step=089 beta=0.315 caught=080 first_cap=1 bc=1.1688 cluster=0.00 idle=0.00 flip=0.33 same=0.09 score_us-bot=45.8-47.2 result=LOSS
[DAgger-PvP] ep=040 step=080 beta=0.311 caught=080 first_cap=0 bc=1.1748 cluster=0.00 idle=0.00 flip=0.28 same=0.05 score_us-bot=40.0-45.0 result=LOSS
[DAgger-PvP] ep=041 step=092 beta=0.308 caught=080 first_cap=0 bc=1.1901 cluster=0.00 idle=0.00 flip=0.37 same=0.06 score_us-bot=47.0-44.5 result=WIN
[DAgger-PvP] ep=042 step=101 beta=0.304 caught=080 first_cap=5 bc=1.1958 cluster=0.00 idle=0.00 flip=0.39 same=0.00 score_us-bot=51.0-46.2 result=WIN
[DAgger-PvP] ep=043 step=080 beta=0.301 caught=080 first_cap=2 bc=1.1348 cluster=0.00 idle=0.00 flip=0.30 same=0.10 score_us-bot=34.0-61.0 result=LOSS
[DAgger-PvP] ep=044 step=075 beta=0.297 caught=080 first_cap=2 bc=1.1250 cluster=0.00 idle=0.01 flip=0.30 same=0.01 score_us-bot=39.2-50.0 result=LOSS
[DAgger-PvP] ep=045 step=086 beta=0.294 caught=080 first_cap=1 bc=1.1003 cluster=0.02 idle=0.00 

[DAgger-PvP] ep=049 step=068 beta=0.280 caught=080 first_cap=1 bc=1.1863 cluster=0.00 idle=0.00 flip=0.33 same=0.00 score_us-bot=43.5-40.0 result=WIN
[DAgger-PvP] ep=050 step=074 beta=0.277 caught=080 first_cap=0 bc=1.1621 cluster=0.00 idle=0.01 flip=0.24 same=0.19 score_us-bot=54.0-50.0 result=WIN
[DAgger-PvP] ep=051 step=076 beta=0.274 caught=080 first_cap=2 bc=1.1505 cluster=0.00 idle=0.01 flip=0.21 same=0.09 score_us-bot=44.8-45.0 result=LOSS
[DAgger-PvP] ep=052 step=081 beta=0.271 caught=080 first_cap=0 bc=1.2001 cluster=0.00 idle=0.00 flip=0.44 same=0.01 score_us-bot=47.0-47.2 result=LOSS
[DAgger-PvP] ep=053 step=094 beta=0.268 caught=080 first_cap=1 bc=1.1311 cluster=0.02 idle=0.00 flip=0.51 same=0.18 score_us-bot=44.0-46.0 result=LOSS
[DAgger-PvP] ep=054 step=078 beta=0.265 caught=080 first_cap=0 bc=1.2374 cluster=0.00 idle=0.01 flip=0.41 same=0.13 score_us-bot=56.2-39.0 result=WIN
[DAgger-PvP] ep=055 step=091 beta=0.262 caught=080 first_cap=0 bc=1.1468 cluster=0.00 idle=0.01 f

[DAgger-PvP] ep=059 step=112 beta=0.250 caught=080 first_cap=3 bc=1.0253 cluster=0.00 idle=0.00 flip=0.22 same=0.19 score_us-bot=21.2-69.0 result=LOSS
[DAgger-PvP] ep=060 step=103 beta=0.247 caught=100 first_cap=1 bc=1.2183 cluster=0.02 idle=0.01 flip=0.30 same=0.09 score_us-bot=56.8-65.0 result=LOSS
[DAgger-PvP] ep=061 step=091 beta=0.244 caught=100 first_cap=0 bc=1.1629 cluster=0.00 idle=0.01 flip=0.32 same=0.27 score_us-bot=60.5-67.0 result=LOSS
[DAgger-PvP] ep=062 step=082 beta=0.241 caught=100 first_cap=2 bc=1.2395 cluster=0.00 idle=0.01 flip=0.34 same=0.04 score_us-bot=58.8-58.0 result=WIN
[DAgger-PvP] ep=063 step=094 beta=0.238 caught=100 first_cap=1 bc=1.2270 cluster=0.00 idle=0.00 flip=0.28 same=0.19 score_us-bot=58.2-57.0 result=WIN
[DAgger-PvP] ep=064 step=090 beta=0.236 caught=100 first_cap=0 bc=1.1620 cluster=0.00 idle=0.01 flip=0.29 same=0.08 score_us-bot=42.0-67.0 result=LOSS
[DAgger-PvP] ep=065 step=100 beta=0.233 caught=100 first_cap=0 bc=1.2029 cluster=0.01 idle=0.00 

[DAgger-PvP] ep=069 step=118 beta=0.222 caught=100 first_cap=1 bc=1.0770 cluster=0.02 idle=0.01 flip=0.33 same=0.18 score_us-bot=60.0-66.0 result=LOSS
[DAgger-PvP] ep=070 step=089 beta=0.220 caught=100 first_cap=1 bc=1.1044 cluster=0.00 idle=0.00 flip=0.46 same=0.04 score_us-bot=56.5-59.2 result=LOSS
[DAgger-PvP] ep=071 step=113 beta=0.217 caught=100 first_cap=1 bc=1.0418 cluster=0.00 idle=0.00 flip=0.30 same=0.15 score_us-bot=36.5-71.0 result=LOSS
[DAgger-PvP] ep=072 step=079 beta=0.215 caught=100 first_cap=2 bc=1.2018 cluster=0.00 idle=0.01 flip=0.34 same=0.10 score_us-bot=47.0-76.0 result=LOSS
[DAgger-PvP] ep=073 step=085 beta=0.212 caught=100 first_cap=1 bc=1.2273 cluster=0.00 idle=0.01 flip=0.38 same=0.06 score_us-bot=41.0-70.0 result=LOSS
[DAgger-PvP] ep=074 step=075 beta=0.210 caught=100 first_cap=0 bc=1.2260 cluster=0.00 idle=0.01 flip=0.38 same=0.01 score_us-bot=59.2-47.0 result=WIN
[DAgger-PvP] ep=075 step=092 beta=0.208 caught=100 first_cap=2 bc=1.2058 cluster=0.00 idle=0.01

[DAgger-PvP] ep=079 step=096 beta=0.198 caught=100 first_cap=5 bc=1.0806 cluster=0.02 idle=0.01 flip=0.41 same=0.19 score_us-bot=53.2-71.0 result=LOSS
[DAgger-PvP] ep=080 step=094 beta=0.196 caught=100 first_cap=2 bc=1.0806 cluster=0.00 idle=0.01 flip=0.40 same=0.00 score_us-bot=62.8-67.0 result=LOSS
[DAgger-PvP] ep=081 step=096 beta=0.194 caught=100 first_cap=0 bc=1.1056 cluster=0.00 idle=0.00 flip=0.28 same=0.14 score_us-bot=54.0-52.0 result=WIN
[DAgger-PvP] ep=082 step=119 beta=0.191 caught=100 first_cap=0 bc=1.0709 cluster=0.04 idle=0.01 flip=0.27 same=0.29 score_us-bot=55.5-72.0 result=LOSS
[DAgger-PvP] ep=083 step=107 beta=0.189 caught=100 first_cap=3 bc=1.0596 cluster=0.00 idle=0.00 flip=0.31 same=0.16 score_us-bot=60.8-58.0 result=WIN
[DAgger-PvP] ep=084 step=074 beta=0.187 caught=080 first_cap=2 bc=1.2507 cluster=0.00 idle=0.01 flip=0.29 same=0.03 score_us-bot=48.0-46.0 result=WIN
[DAgger-PvP] ep=085 step=085 beta=0.185 caught=080 first_cap=1 bc=1.1932 cluster=0.00 idle=0.01 f

[DAgger-PvP] ep=089 step=084 beta=0.176 caught=080 first_cap=4 bc=1.1862 cluster=0.00 idle=0.01 flip=0.31 same=0.01 score_us-bot=38.0-42.0 result=LOSS
[DAgger-PvP] ep=090 step=090 beta=0.174 caught=080 first_cap=0 bc=1.1326 cluster=0.00 idle=0.00 flip=0.38 same=0.07 score_us-bot=41.0-49.0 result=LOSS
[DAgger-PvP] ep=091 step=092 beta=0.172 caught=080 first_cap=2 bc=1.0859 cluster=0.01 idle=0.01 flip=0.31 same=0.33 score_us-bot=47.0-50.0 result=LOSS
[DAgger-PvP] ep=092 step=065 beta=0.170 caught=080 first_cap=1 bc=1.1389 cluster=0.00 idle=0.02 flip=0.50 same=0.09 score_us-bot=41.2-45.0 result=LOSS
[DAgger-PvP] ep=093 step=087 beta=0.169 caught=080 first_cap=2 bc=1.1020 cluster=0.00 idle=0.01 flip=0.36 same=0.19 score_us-bot=52.0-52.0 result=DRAW
[DAgger-PvP] ep=094 step=103 beta=0.167 caught=080 first_cap=3 bc=1.0677 cluster=0.06 idle=0.01 flip=0.25 same=0.49 score_us-bot=52.0-41.2 result=WIN
[DAgger-PvP] ep=095 step=111 beta=0.165 caught=080 first_cap=0 bc=1.1153 cluster=0.00 idle=0.00

[DAgger-PvP] ep=099 step=089 beta=0.157 caught=080 first_cap=1 bc=1.2363 cluster=0.01 idle=0.00 flip=0.44 same=0.07 score_us-bot=42.0-50.0 result=LOSS
[DAgger-PvP] ep=100 step=085 beta=0.155 caught=080 first_cap=0 bc=1.1635 cluster=0.00 idle=0.00 flip=0.24 same=0.14 score_us-bot=55.2-48.0 result=WIN
[DAgger-PvP] ep=101 step=073 beta=0.154 caught=080 first_cap=0 bc=1.1553 cluster=0.00 idle=0.01 flip=0.31 same=0.01 score_us-bot=36.0-47.0 result=LOSS
[DAgger-PvP] ep=102 step=095 beta=0.152 caught=080 first_cap=1 bc=1.2023 cluster=0.01 idle=0.00 flip=0.38 same=0.15 score_us-bot=36.2-51.0 result=LOSS
[DAgger-PvP] ep=103 step=079 beta=0.150 caught=080 first_cap=1 bc=1.1077 cluster=0.03 idle=0.01 flip=0.26 same=0.16 score_us-bot=45.0-54.0 result=LOSS
[DAgger-PvP] ep=104 step=108 beta=0.148 caught=080 first_cap=1 bc=1.0410 cluster=0.00 idle=0.01 flip=0.28 same=0.00 score_us-bot=65.0-60.8 result=WIN
[DAgger-PvP] ep=105 step=103 beta=0.147 caught=080 first_cap=2 bc=1.0812 cluster=0.01 idle=0.00 

[DAgger-PvP] ep=109 step=085 beta=0.140 caught=100 first_cap=1 bc=1.2595 cluster=0.00 idle=0.01 flip=0.33 same=0.09 score_us-bot=71.0-58.0 result=WIN
[DAgger-PvP] ep=110 step=096 beta=0.138 caught=100 first_cap=2 bc=1.2091 cluster=0.00 idle=0.01 flip=0.34 same=0.01 score_us-bot=52.2-60.0 result=LOSS
[DAgger-PvP] ep=111 step=081 beta=0.137 caught=100 first_cap=2 bc=1.2228 cluster=0.00 idle=0.01 flip=0.35 same=0.07 score_us-bot=47.5-63.0 result=LOSS
[DAgger-PvP] ep=112 step=094 beta=0.135 caught=100 first_cap=0 bc=1.2280 cluster=0.04 idle=0.00 flip=0.39 same=0.33 score_us-bot=53.2-54.0 result=LOSS
[DAgger-PvP] ep=113 step=087 beta=0.134 caught=100 first_cap=0 bc=1.1550 cluster=0.00 idle=0.01 flip=0.57 same=0.09 score_us-bot=51.5-62.0 result=LOSS
[DAgger-PvP] ep=114 step=110 beta=0.132 caught=100 first_cap=0 bc=1.1336 cluster=0.03 idle=0.00 flip=0.37 same=0.05 score_us-bot=47.5-71.0 result=LOSS
[DAgger-PvP] ep=115 step=072 beta=0.131 caught=100 first_cap=0 bc=1.1261 cluster=0.00 idle=0.01

[DAgger-PvP] ep=119 step=126 beta=0.125 caught=100 first_cap=1 bc=1.0058 cluster=0.04 idle=0.02 flip=0.25 same=0.16 score_us-bot=59.2-70.5 result=LOSS
[DAgger-PvP] ep=120 step=093 beta=0.123 caught=100 first_cap=2 bc=1.2689 cluster=0.00 idle=0.01 flip=0.26 same=0.04 score_us-bot=65.5-56.0 result=WIN
[DAgger-PvP] ep=121 step=100 beta=0.122 caught=100 first_cap=0 bc=1.1723 cluster=0.02 idle=0.00 flip=0.27 same=0.08 score_us-bot=54.2-73.0 result=LOSS
[DAgger-PvP] ep=122 step=092 beta=0.120 caught=100 first_cap=0 bc=1.1622 cluster=0.03 idle=0.01 flip=0.34 same=0.32 score_us-bot=38.0-62.0 result=LOSS
[DAgger-PvP] ep=123 step=075 beta=0.119 caught=100 first_cap=0 bc=1.1948 cluster=0.05 idle=0.00 flip=0.36 same=0.24 score_us-bot=49.2-57.0 result=LOSS
[DAgger-PvP] ep=124 step=085 beta=0.118 caught=100 first_cap=1 bc=1.1927 cluster=0.00 idle=0.01 flip=0.34 same=0.02 score_us-bot=57.5-60.2 result=LOSS
[DAgger-PvP] ep=125 step=094 beta=0.116 caught=100 first_cap=0 bc=1.1539 cluster=0.00 idle=0.01

[DAgger-PvP] ep=129 step=110 beta=0.111 caught=100 first_cap=2 bc=1.1095 cluster=0.00 idle=0.00 flip=0.28 same=0.06 score_us-bot=54.2-57.8 result=LOSS
[DAgger-PvP] ep=130 step=101 beta=0.110 caught=100 first_cap=1 bc=1.0908 cluster=0.01 idle=0.01 flip=0.35 same=0.13 score_us-bot=52.2-57.0 result=LOSS
[DAgger-PvP] ep=131 step=130 beta=0.109 caught=100 first_cap=2 bc=1.0471 cluster=0.03 idle=0.02 flip=0.27 same=0.24 score_us-bot=52.0-58.5 result=LOSS
[DAgger-PvP] ep=132 step=080 beta=0.107 caught=080 first_cap=1 bc=1.1710 cluster=0.00 idle=0.01 flip=0.36 same=0.06 score_us-bot=36.5-56.0 result=LOSS
[DAgger-PvP] ep=133 step=087 beta=0.106 caught=080 first_cap=2 bc=1.2102 cluster=0.00 idle=0.01 flip=0.34 same=0.14 score_us-bot=44.2-58.0 result=LOSS
[DAgger-PvP] ep=134 step=097 beta=0.105 caught=080 first_cap=0 bc=1.2528 cluster=0.00 idle=0.01 flip=0.27 same=0.09 score_us-bot=38.0-54.0 result=LOSS
[DAgger-PvP] ep=135 step=087 beta=0.104 caught=080 first_cap=0 bc=1.1631 cluster=0.00 idle=0.0

[DAgger-PvP] ep=139 step=099 beta=0.099 caught=080 first_cap=0 bc=1.0826 cluster=0.00 idle=0.00 flip=0.29 same=0.13 score_us-bot=48.0-35.0 result=WIN
[DAgger-PvP] ep=140 step=137 beta=0.098 caught=080 first_cap=0 bc=1.0786 cluster=0.03 idle=0.01 flip=0.42 same=0.12 score_us-bot=41.5-47.0 result=LOSS
[DAgger-PvP] ep=141 step=111 beta=0.097 caught=080 first_cap=1 bc=1.0635 cluster=0.01 idle=0.01 flip=0.29 same=0.12 score_us-bot=40.8-53.0 result=LOSS
[DAgger-PvP] ep=142 step=103 beta=0.096 caught=080 first_cap=1 bc=1.0704 cluster=0.00 idle=0.01 flip=0.26 same=0.07 score_us-bot=29.0-54.0 result=LOSS
[DAgger-PvP] ep=143 step=180 beta=0.094 caught=080 first_cap=0 bc=1.0373 cluster=0.03 idle=0.01 flip=0.24 same=0.19 score_us-bot=28.0-61.0 result=LOSS
[DAgger-PvP] ep=144 step=096 beta=0.093 caught=080 first_cap=1 bc=1.1936 cluster=0.00 idle=0.01 flip=0.37 same=0.04 score_us-bot=28.0-52.0 result=LOSS
[DAgger-PvP] ep=145 step=090 beta=0.092 caught=080 first_cap=3 bc=1.2492 cluster=0.00 idle=0.01

[DAgger-PvP] ep=149 step=100 beta=0.088 caught=080 first_cap=1 bc=1.1697 cluster=0.00 idle=0.00 flip=0.30 same=0.01 score_us-bot=39.5-67.0 result=LOSS
[DAgger-PvP] ep=150 step=106 beta=0.087 caught=080 first_cap=3 bc=1.1762 cluster=0.00 idle=0.01 flip=0.50 same=0.05 score_us-bot=42.0-54.0 result=LOSS
[DAgger-PvP] ep=151 step=149 beta=0.086 caught=080 first_cap=2 bc=1.0423 cluster=0.00 idle=0.01 flip=0.33 same=0.02 score_us-bot=52.0-68.0 result=LOSS
[DAgger-PvP] ep=152 step=114 beta=0.085 caught=080 first_cap=0 bc=1.0236 cluster=0.00 idle=0.01 flip=0.35 same=0.00 score_us-bot=36.0-47.0 result=LOSS
[DAgger-PvP] ep=153 step=113 beta=0.084 caught=080 first_cap=2 bc=1.0956 cluster=0.02 idle=0.01 flip=0.32 same=0.33 score_us-bot=42.2-51.0 result=LOSS
[DAgger-PvP] ep=154 step=161 beta=0.083 caught=080 first_cap=2 bc=1.0433 cluster=0.09 idle=0.07 flip=0.33 same=0.28 score_us-bot=42.2-65.0 result=LOSS
[DAgger-PvP] ep=155 step=132 beta=0.082 caught=080 first_cap=1 bc=1.1161 cluster=0.00 idle=0.0

[DAgger-PvP] ep=159 step=144 beta=0.079 caught=100 first_cap=1 bc=1.2010 cluster=0.00 idle=0.00 flip=0.39 same=0.05 score_us-bot=57.5-59.2 result=LOSS
[DAgger-PvP] ep=160 step=115 beta=0.078 caught=100 first_cap=2 bc=1.1395 cluster=0.01 idle=0.01 flip=0.32 same=0.08 score_us-bot=73.2-74.5 result=LOSS
[DAgger-PvP] ep=161 step=105 beta=0.077 caught=100 first_cap=1 bc=1.1588 cluster=0.00 idle=0.01 flip=0.39 same=0.08 score_us-bot=61.0-60.0 result=WIN
[DAgger-PvP] ep=162 step=143 beta=0.076 caught=100 first_cap=1 bc=1.1606 cluster=0.00 idle=0.01 flip=0.38 same=0.06 score_us-bot=43.0-60.0 result=LOSS
[DAgger-PvP] ep=163 step=142 beta=0.075 caught=100 first_cap=2 bc=1.0713 cluster=0.01 idle=0.01 flip=0.38 same=0.13 score_us-bot=49.8-73.8 result=LOSS
[DAgger-PvP] ep=164 step=143 beta=0.074 caught=100 first_cap=0 bc=1.0728 cluster=0.00 idle=0.00 flip=0.40 same=0.15 score_us-bot=62.2-61.0 result=WIN
[DAgger-PvP] ep=165 step=123 beta=0.073 caught=100 first_cap=0 bc=1.0911 cluster=0.00 idle=0.01 

[DAgger-PvP] ep=169 step=101 beta=0.070 caught=100 first_cap=0 bc=1.2456 cluster=0.02 idle=0.01 flip=0.30 same=0.12 score_us-bot=37.0-72.0 result=LOSS
[DAgger-PvP] ep=170 step=087 beta=0.069 caught=100 first_cap=2 bc=1.1694 cluster=0.00 idle=0.00 flip=0.30 same=0.01 score_us-bot=46.0-57.0 result=LOSS
[DAgger-PvP] ep=171 step=100 beta=0.068 caught=100 first_cap=0 bc=1.1660 cluster=0.00 idle=0.00 flip=0.43 same=0.05 score_us-bot=53.0-66.0 result=LOSS
[DAgger-PvP] ep=172 step=116 beta=0.068 caught=100 first_cap=2 bc=1.1935 cluster=0.00 idle=0.01 flip=0.43 same=0.12 score_us-bot=63.5-65.0 result=LOSS
[DAgger-PvP] ep=173 step=120 beta=0.067 caught=100 first_cap=0 bc=1.1210 cluster=0.00 idle=0.01 flip=0.35 same=0.10 score_us-bot=41.0-68.0 result=LOSS
[DAgger-PvP] ep=174 step=167 beta=0.066 caught=100 first_cap=0 bc=1.0883 cluster=0.02 idle=0.00 flip=0.46 same=0.16 score_us-bot=42.8-73.0 result=LOSS
[DAgger-PvP] ep=175 step=132 beta=0.065 caught=100 first_cap=0 bc=1.0587 cluster=0.00 idle=0.0

[DAgger-PvP] ep=179 step=175 beta=0.062 caught=100 first_cap=1 bc=1.0878 cluster=0.01 idle=0.01 flip=0.40 same=0.07 score_us-bot=62.0-61.5 result=WIN
[DAgger-PvP] ep=180 step=071 beta=0.062 caught=080 first_cap=1 bc=1.1408 cluster=0.00 idle=0.01 flip=0.31 same=0.08 score_us-bot=31.0-64.0 result=LOSS
[DAgger-PvP] ep=181 step=058 beta=0.061 caught=080 first_cap=2 bc=1.2435 cluster=0.00 idle=0.02 flip=0.34 same=0.02 score_us-bot=37.8-51.0 result=LOSS
[DAgger-PvP] ep=182 step=060 beta=0.060 caught=080 first_cap=0 bc=1.2151 cluster=0.03 idle=0.02 flip=0.28 same=0.05 score_us-bot=49.2-51.0 result=LOSS
[DAgger-PvP] ep=183 step=057 beta=0.059 caught=080 first_cap=0 bc=1.2025 cluster=0.02 idle=0.02 flip=0.31 same=0.10 score_us-bot=29.0-54.0 result=LOSS
[DAgger-PvP] ep=184 step=069 beta=0.059 caught=080 first_cap=0 bc=1.1480 cluster=0.03 idle=0.01 flip=0.43 same=0.16 score_us-bot=34.0-52.0 result=LOSS
[DAgger-PvP] ep=185 step=080 beta=0.058 caught=080 first_cap=2 bc=1.1432 cluster=0.00 idle=0.01

[DAgger-PvP] ep=189 step=084 beta=0.055 caught=080 first_cap=0 bc=1.0661 cluster=0.06 idle=0.01 flip=0.38 same=0.08 score_us-bot=45.2-66.0 result=LOSS
[DAgger-PvP] ep=190 step=063 beta=0.055 caught=080 first_cap=0 bc=1.0845 cluster=0.00 idle=0.02 flip=0.42 same=0.12 score_us-bot=50.2-52.0 result=LOSS
[DAgger-PvP] ep=191 step=072 beta=0.054 caught=080 first_cap=1 bc=1.0383 cluster=0.05 idle=0.01 flip=0.41 same=0.33 score_us-bot=37.0-46.0 result=LOSS
[DAgger-PvP] ep=192 step=075 beta=0.054 caught=080 first_cap=0 bc=1.1243 cluster=0.00 idle=0.01 flip=0.28 same=0.16 score_us-bot=53.2-50.2 result=WIN
[DAgger-PvP] ep=193 step=074 beta=0.053 caught=080 first_cap=0 bc=1.1605 cluster=0.01 idle=0.00 flip=0.31 same=0.08 score_us-bot=53.0-48.0 result=WIN
[DAgger-PvP] ep=194 step=080 beta=0.052 caught=080 first_cap=1 bc=1.1889 cluster=0.01 idle=0.01 flip=0.47 same=0.33 score_us-bot=38.2-55.0 result=LOSS
[DAgger-PvP] ep=195 step=064 beta=0.052 caught=080 first_cap=3 bc=1.1664 cluster=0.00 idle=0.00 

[DAgger-PvP] ep=199 step=056 beta=0.050 caught=080 first_cap=1 bc=1.1817 cluster=0.04 idle=0.02 flip=0.44 same=0.00 score_us-bot=44.0-51.0 result=LOSS
Lite checkpoint: C:\Users\aveexela\Desktop\rl_project\logs\checkpoints\agent_lite.pkl
Сохранено также в ./agent_lite.pkl
