In [None]:
import numpy as np
import torch
import torch.nn.functional as F
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.monitor import Monitor

# Load pretrained ConvAE (frozen) and AE stats
device = "cuda" if torch.cuda.is_available() else "cpu"

# Take the window shape from the .npy file instead of from the `X_unlabeled`
# variable: this cell only loads that array from the same file further down,
# so reading the variable here would depend on a leftover value from an
# earlier cell. mmap_mode="r" maps the file rather than reading the whole
# array into memory; only the shape tuple is kept.
unlabeled_shape = np.load("X_unlabeled.npy", mmap_mode="r").shape
ae_model = ConvAE(feat_dim=unlabeled_shape[2], seq_len=unlabeled_shape[1], latent_dim=128)
ae_model.load_state_dict(torch.load("convAE_best.pth", map_location=device))
ae_model.to(device)
ae_model.eval()
# The autoencoder is only used for inference below, so gradients are disabled.
for p in ae_model.parameters():
    p.requires_grad = False
print(f"‚úÖ Loaded ConvAE on {device} (frozen for Standard PPO).")

# Mean / std of the reconstruction error; the environment's error
# normalization reads these module-level values.
stats = np.load("convAE_stats.npz")
ae_mean, ae_std = stats["mean_err"], stats["std_err"]
print(f"‚úÖ AE normalization loaded: mean={ae_mean:.6f}, std={ae_std:.6f}")


# ============================================================================
# STEP 1: Reconstruct chronological order
# ============================================================================
print("\n" + "="*70)
print("üîÑ RECONSTRUCTING CHRONOLOGICAL ORDER")
print("="*70)

# Load labeled and unlabeled data
X_labeled = np.load("X_labeled.npy")
y_labeled = np.load("y_labeled.npy")
X_unlabeled = np.load("X_unlabeled.npy")
y_unlabeled = np.load("y_unlabeled.npy")

# Load the original positions of each partition and the size of the full split
labeled_indices = np.load("labeled_indices.npy")
unlabeled_indices = np.load("unlabeled_indices.npy")
train_split_size = np.load("train_split_size.npy")[0]

print(f"üìä Data loaded:")
print(f"   Labeled samples: {len(X_labeled):,}")
print(f"   Unlabeled samples: {len(X_unlabeled):,}")
print(f"   Original training size: {train_split_size:,}")

# Reconstruct X and y in chronological order.
# The buffers start as zeros, so any position not written below stays zero.
X_train_reconstructed = np.zeros((train_split_size, *X_labeled.shape[1:]), dtype=X_labeled.dtype)
y_train_reconstructed = np.zeros(train_split_size, dtype=y_labeled.dtype)

# Place data back at original positions (unlabeled is written last, so it wins
# at any position that appears in both index sets)
X_train_reconstructed[labeled_indices] = X_labeled
y_train_reconstructed[labeled_indices] = y_labeled
X_train_reconstructed[unlabeled_indices] = X_unlabeled
y_train_reconstructed[unlabeled_indices] = y_unlabeled

# Create supervision mask (1 = labeled/supervised, 0 = unlabeled/unsupervised)
supervision_mask = np.zeros(train_split_size, dtype=np.int8)
supervision_mask[labeled_indices] = 1

# Check that the two index sets partition the split. The label comparison
# further down cannot detect an unfilled position whose true label is 0 (the
# placeholder value), nor a window that was left as all zeros.
filled_mask = supervision_mask.astype(bool)
n_overlapping = int(np.sum(filled_mask[unlabeled_indices]))
filled_mask[unlabeled_indices] = True
n_unfilled = int(train_split_size - np.sum(filled_mask))
if n_overlapping or n_unfilled:
    print(f"WARNING: index sets overlap at {n_overlapping:,} position(s) and leave "
          f"{n_unfilled:,} position(s) unfilled (zero placeholders)!")

print(f"\n‚úÖ Reconstructed training data:")
print(f"   Shape: {X_train_reconstructed.shape}")
print(f"   Supervised positions: {np.sum(supervision_mask):,} ({np.sum(supervision_mask)/train_split_size*100:.2f}%)")
print(f"   Unsupervised positions: {np.sum(1-supervision_mask):,} ({np.sum(1-supervision_mask)/train_split_size*100:.2f}%)")

# Verify reconstruction against the saved label sequence (labels only)
y_train_original = np.load("y_train_seq.npy")
if np.array_equal(y_train_reconstructed, y_train_original):
    print("‚úÖ VERIFICATION PASSED: Reconstruction matches original!")
else:
    print("‚ö†Ô∏è  WARNING: Reconstruction mismatch detected!")

print("="*70 + "\n")


# ============================================================================
# STEP 2: PPO Environment (Chronological Sequential)
# ============================================================================

class PPOAEEnvChronological(gym.Env):
    """
    Sequential environment that processes windows in chronological order.

    Observation: the precomputed embedding of the current window concatenated
    with its normalized autoencoder reconstruction error (``emb_dim + 1``
    float32 values).
    Action: 0 = normal, 1 = anomaly.
    Reward: an external term (+1.0 for a correct prediction, -0.5 for a wrong
    one, 0.0 where ``supervision_mask`` is not 1) plus an intrinsic term based
    on the reconstruction error when an anomaly is predicted. The sum is
    clipped to [-5, 5].

    The error normalization reads the module-level ``ae_mean`` and ``ae_std``.
    """

    def __init__(self, X_train, y_train, supervision_mask, ae_model,
                 embeddings, lambda_int=1.0, max_steps=2000):
        """
        Args:
            X_train: windows, indexed by chronological position.
            y_train: labels aligned with ``X_train``.
            supervision_mask: 1 where the label may be used for the reward.
            ae_model: autoencoder called as ``ae_model(x)`` to reconstruct a
                batch of windows.
            embeddings: 2-D array of per-window embeddings aligned with
                ``X_train``.
            lambda_int: weight of the intrinsic (reconstruction-error) reward.
            max_steps: number of steps after which an episode ends.
        """
        super().__init__()
        self.X_train = X_train
        self.y_train = y_train
        self.supervision_mask = supervision_mask
        self.ae_model = ae_model
        self.device = next(ae_model.parameters()).device
        self.lambda_int = lambda_int
        self.max_steps = max_steps
        self.steps = 0
        self.idx = 0
        self.embeddings = embeddings
        # (window index, normalized error) of the most recent AE evaluation.
        self._err_cache = (None, 0.0)

        emb_dim = self.embeddings.shape[1]
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf,
            shape=(emb_dim + 1,),
            dtype=np.float32
        )
        self.action_space = spaces.Discrete(2)  # 0 = normal, 1 = anomaly

        # Statistics
        n_supervised = np.sum(supervision_mask)
        n_unsupervised = len(supervision_mask) - n_supervised
        print(f"\nüîπ Environment initialized:")
        print(f"   Total windows: {len(X_train):,}")
        print(f"   Supervised: {n_supervised:,} ({n_supervised/len(X_train)*100:.2f}%)")
        print(f"   Unsupervised: {n_unsupervised:,} ({n_unsupervised/len(X_train)*100:.2f}%)")

    def _ae_error(self, x):
        """Compute the normalized reconstruction error of one window.

        The MSE between the window and its reconstruction is standardized with
        ``ae_mean`` / ``ae_std``, squashed with tanh and floored at 0, so the
        result is a float in [0, 1).
        """
        with torch.no_grad():
            x = torch.tensor(x, dtype=torch.float32).unsqueeze(0).to(self.device)
            recon = self.ae_model(x)
            loss = F.mse_loss(recon, x, reduction="mean").item()
        norm = np.tanh((loss - ae_mean) / (ae_std + 1e-8))
        return max(0.0, float(norm))

    def _error_at(self, idx):
        """Return the normalized AE error of window ``idx``, reusing the last result.

        Each window's error is needed twice in a row: once for the observation
        that shows it and once for the reward of the following step. Keying the
        cache on the index keeps it correct even if ``idx`` is changed from
        outside between calls.
        """
        if self._err_cache[0] != idx:
            self._err_cache = (idx, self._ae_error(self.X_train[idx]))
        return self._err_cache[1]

    def _observation(self):
        """Build the float32 observation (embedding + error) for ``self.idx``."""
        err = np.array([self._error_at(self.idx)])
        return np.concatenate([self.embeddings[self.idx], err]).astype(np.float32)

    def reset(self, *, seed=None, options=None):
        """Reset to start of sequence (or random position 10% of time).

        Returns:
            ``(obs, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.steps = 0

        # Draw from the environment's own generator (seeded by the call above)
        # rather than the global NumPy RNG, so `seed` actually controls the
        # start position.
        if self.np_random.random() < 0.1:  # 10% random start for variety
            self.idx = int(self.np_random.integers(0, len(self.X_train)))
        else:
            self.idx = 0

        return self._observation(), {}

    def step(self, action):
        """Take action on current window, move to next chronologically.

        Returns:
            ``(obs, reward, done, False, info)``: the observation of the next
            window, the clipped reward for the current one, whether the step
            limit was reached, a constant ``False`` and an empty info dict.
        """
        self.steps += 1

        # Current window
        true_label = self.y_train[self.idx]
        is_supervised = self.supervision_mask[self.idx]

        # Reconstruction error of the current window (normally already cached
        # from building the observation that showed this window).
        err = self._error_at(self.idx)

        # ========== REWARD CALCULATION ==========

        # External reward (only for supervised positions)
        if is_supervised == 1:
            external_reward = 1.0 if action == true_label else -0.5
        else:
            external_reward = 0.0

        # Intrinsic reward: encourage anomaly predictions when the error is high
        if action == 1:  # Predicted anomaly
            intrinsic_reward = self.lambda_int * err
            # Penalize weak false positives
            if err < 0.05:
                intrinsic_reward -= 0.5
        else:  # Predicted normal
            intrinsic_reward = 0.0

        # Total reward
        reward = float(np.clip(external_reward + intrinsic_reward, -5, 5))

        # ========== MOVE TO NEXT WINDOW (CHRONOLOGICALLY) ==========
        # Wraps around to the first window after the last one.
        self.idx = (self.idx + 1) % len(self.X_train)
        obs = self._observation()

        # NOTE(review): the step limit is reported through the `terminated`
        # slot and `truncated` is always False. A pure time limit is normally
        # reported as truncation; left unchanged here because other code may
        # loop on the third return value -- confirm before changing.
        done = self.steps >= self.max_steps

        return obs, reward, done, False, {}


# ============================================================================
# STEP 3: Precompute embeddings
# ============================================================================

def compute_embeddings(X, batch_size=1024):
    """Compute embeddings for all windows.

    The windows are encoded in slices of ``batch_size`` so that only one slice
    at a time is moved to ``device``, instead of the whole dataset.

    Args:
        X: array of windows; the first axis indexes windows.
        batch_size: number of windows encoded per forward pass (must be >= 1).

    Returns:
        NumPy array with the outputs of ``ae_model.encode`` for every window,
        concatenated along the first axis in input order.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    chunks = []
    with torch.no_grad():
        # max(..., 1) keeps exactly one encoder call for empty input, which is
        # what a single full-batch call would have done.
        for start in range(0, max(len(X), 1), batch_size):
            batch = torch.tensor(X[start:start + batch_size], dtype=torch.float32).to(device)
            chunks.append(ae_model.encode(batch).cpu().numpy())
    return np.concatenate(chunks, axis=0)

print("üîπ Precomputing embeddings for reconstructed data...")
embeddings_train = compute_embeddings(X_train_reconstructed)
print(f"‚úÖ Embeddings computed: {embeddings_train.shape}")


# ============================================================================
# STEP 4: Create VecEnv + Standard PPO
# ============================================================================

def make_env(rank):
    """Return a zero-argument factory that builds one monitored environment.

    Args:
        rank: value used to name this environment's monitor log
            (``logs/env_<rank>``).

    Returns:
        A callable that, when invoked, constructs a ``PPOAEEnvChronological``
        over the reconstructed training data and wraps it in ``Monitor``.
    """
    monitor_path = f"logs/env_{rank}"

    def _build():
        return Monitor(
            PPOAEEnvChronological(
                X_train_reconstructed,
                y_train_reconstructed,
                supervision_mask,
                ae_model,
                embeddings_train,
                lambda_int=0.8,
                max_steps=2000,
            ),
            monitor_path,
        )

    return _build

# Single monitored environment, wrapped so that observations and rewards are
# normalized with running statistics (observations clipped to +/-10).
num_envs = 1
vec_env = VecNormalize(
    DummyVecEnv([make_env(rank) for rank in range(num_envs)]),
    norm_obs=True,
    norm_reward=True,
    clip_obs=10.,
)

# Standard MLP policy (no LSTM): separate layer sizes for the policy (pi) and
# value (vf) networks.
policy_kwargs = {
    "net_arch": {
        "pi": [256, 256],
        "vf": [512, 512, 256],  # Same architecture as recurrent, but no LSTM
    },
}

model = PPO(
    "MlpPolicy",  # Standard MLP policy (non-recurrent)
    vec_env,
    policy_kwargs=policy_kwargs,
    # Optimisation
    learning_rate=1e-4,
    n_steps=128,
    batch_size=64,
    n_epochs=10,
    # Return / advantage estimation
    gamma=0.99,
    gae_lambda=0.95,
    # PPO objective
    ent_coef=0.02,
    clip_range=0.2,
    # Logging
    verbose=1,
    tensorboard_log="logs_standard_ppo_ae_chronological/",
)

print(
    "\n" + "="*70,
    "üöÄ Starting Standard PPO training with chronological data",
    "="*70 + "\n",
    sep="\n",
)


# ============================================================================
# STEP 5: Train Standard PPO
# ============================================================================

# Run PPO for 500k environment steps.
model.learn(total_timesteps=500_000)

# Persist the policy together with the normalization wrapper's state; both
# files are named in the summary printed below.
model.save("standard_ppo_ae_chronological")
vec_env.save("vec_normalize_standard_ppo_chronological.pkl")

print(
    "\n‚úÖ Training complete! Chronological Standard PPO model saved.",
    "   Model: standard_ppo_ae_chronological.zip",
    "   VecNormalize: vec_normalize_standard_ppo_chronological.pkl",
    sep="\n",
)