In [1]:
#!/usr/bin/env python3
# ===============================================
# PP_15 - SEQUENCE TRANSFORMER (GOD SOTA 2026 v4)
# VERSION FINALE BULLETPROOF
# ===============================================
#
# CORRECTIONS v4:
# ✅ Assert len == FEATURE_DIM
# ✅ Split sur liste PRÉ-FILTRÉE
# ✅ AMP avec dtype explicite
# ✅ match_id sécurisé str()
# ✅ Dropout cohérent (self.dropout_p)
# ✅ TF32 enabled
# + Toutes les corrections v3
#
# Output: features/sequence_transformer/
# ===============================================

import numpy as np
import polars as pl
from pathlib import Path
from datetime import datetime
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

try:
    from sklearn.metrics import roc_auc_score
    HAS_SKLEARN = True
except ImportError:
    HAS_SKLEARN = False

# ===============================================
# CONFIGURATION
# ===============================================
# Filesystem layout: inputs are read from MATCHES_BASE, outputs go to OUTPUT_DIR
# (created at import time if missing).
ROOT = Path(r"C:\Users\Administrateur\Tennis POLAR v2")
DATA_CLEAN = ROOT / "data_clean"
MATCHES_BASE = DATA_CLEAN / "matches_base"
OUTPUT_DIR = DATA_CLEAN / "features" / "sequence_transformer"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Number of past matches fed to the model per player.
SEQ_LENGTH = 20
FEATURE_DIM = 13  # 8 serve/return stats + 4 surface one-hot + 1 opponent rank

# Transformer architecture.
D_MODEL = 64
N_HEADS = 4
N_LAYERS = 2
D_FF = 128
DROPOUT = 0.2
EMBEDDING_DIM = 32

# Optimisation and early-stopping settings.
EPOCHS = 100
LEARNING_RATE = 0.0001
BATCH_SIZE = 512
WEIGHT_DECAY = 1e-5
PATIENCE = 20
WARMUP_EPOCHS = 10

# Fraction of (date-sorted) matches placed before the train/validation cutoff.
TRAIN_RATIO = 0.9
USE_AMP = True

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Allow TF32 matmul/cuDNN kernels on CUDA for a performance boost.
if DEVICE.type == "cuda":
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

# Startup banner.
print("=" * 70)
print("   PP_15 - SEQUENCE TRANSFORMER (GOD SOTA 2026 v4 FINAL)")
print("=" * 70)
print(f"   {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"   Device: {DEVICE}")
print(f"   TF32: {DEVICE.type == 'cuda'}")
print(f"   Feature dim: {FEATURE_DIM}")
print("=" * 70)


# ===============================================
# TRANSFORMER MODEL
# ===============================================

class TemporalPositionalEncoding(nn.Module):
    """Add a learned position embedding and a "days ago" projection to a sequence.

    Args:
        d_model: Width of the token representations being augmented.
        max_days: Normalisation constant for elapsed days; values are clamped
            to ``[0, 3 * max_days]`` before being divided by ``max_days``.
    """

    def __init__(self, d_model: int, max_days: int = 365):
        super().__init__()
        self.max_days = max_days

        # Creation order (embedding, then linear) is kept so the random
        # initialisation stream is unchanged.
        self.pos_embedding = nn.Embedding(SEQ_LENGTH + 1, d_model)
        self.days_proj = nn.Linear(1, d_model)

        # Small initial scale for both additive terms.
        nn.init.normal_(self.pos_embedding.weight, std=0.01)
        nn.init.xavier_uniform_(self.days_proj.weight, gain=0.5)

    def forward(self, x, days_ago):
        """Return ``x`` plus positional and elapsed-time encodings.

        Args:
            x: Tensor unpacked as ``(batch, seq, features)``.
            days_ago: Per-step elapsed days; a trailing axis is appended before
                the linear projection.
        """
        n_batch, n_steps, _ = x.shape

        # Index 0..n_steps-1 for every sequence in the batch.
        step_ids = torch.arange(n_steps, device=x.device)[None, :].expand(n_batch, -1)
        positional = self.pos_embedding(step_ids)

        elapsed = days_ago.clamp(min=0, max=self.max_days * 3).float().unsqueeze(-1)
        temporal = self.days_proj(elapsed / self.max_days)

        return x + positional + temporal


class SequenceTransformer(nn.Module):
    """Encode a sequence of per-match feature vectors into one embedding.

    Pipeline: linear projection -> LayerNorm -> dropout -> temporal positional
    encoding -> prepend a learnable CLS token -> Transformer encoder -> MLP
    head on the CLS output, clamped to ``[-10, 10]``.

    Args:
        input_dim: Size of each input feature vector.
        d_model: Transformer hidden width.
        n_heads: Number of attention heads.
        n_layers: Number of encoder layers.
        d_ff: Feed-forward width inside each encoder layer.
        dropout: Dropout probability used in every dropout site.
        output_dim: Size of the returned embedding.
    """

    def __init__(self, input_dim: int, d_model: int, n_heads: int,
                 n_layers: int, d_ff: int, dropout: float, output_dim: int):
        super().__init__()

        # Kept so forward() applies the same rate as the rest of the network.
        self.dropout_p = dropout

        # NOTE: submodules are created in this exact order so the random
        # initialisation stream is unchanged.
        self.input_proj = nn.Linear(input_dim, d_model)
        self.input_norm = nn.LayerNorm(d_model)
        self.pos_encoding = TemporalPositionalEncoding(d_model)

        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=n_heads,
                dim_feedforward=d_ff,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=n_layers,
        )

        head_layers = [
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, output_dim),
        ]
        self.output_proj = nn.Sequential(*head_layers)

        self.cls_token = nn.Parameter(0.01 * torch.randn(1, 1, d_model))

        # Re-initialise the projection and head weights with a reduced gain.
        rescaled = [self.input_proj]
        rescaled += [layer for layer in self.output_proj if isinstance(layer, nn.Linear)]
        for layer in rescaled:
            nn.init.xavier_uniform_(layer.weight, gain=0.5)

    def forward(self, x, days_ago, mask=None):
        """Return the clamped CLS embedding for each sequence in the batch.

        Args:
            x: Input features; the last axis is projected by ``input_proj``.
            days_ago: Elapsed-days tensor passed to the positional encoding.
            mask: Optional boolean key-padding mask (True marks padded steps).
        """
        n_batch = x.shape[0]

        # Replace NaNs only when at least one is present.
        if torch.isnan(x).any():
            x = torch.nan_to_num(x, nan=0.0)

        hidden = self.input_norm(self.input_proj(x))
        hidden = F.dropout(hidden, p=self.dropout_p, training=self.training)
        hidden = self.pos_encoding(hidden, days_ago)

        cls = self.cls_token.expand(n_batch, -1, -1)
        tokens = torch.cat([cls, hidden], dim=1)

        padding_mask = None
        if mask is not None:
            # The CLS slot is never masked out.
            cls_visible = torch.zeros(n_batch, 1, dtype=torch.bool, device=tokens.device)
            padding_mask = torch.cat([cls_visible, mask], dim=1)

        encoded = self.transformer(tokens, src_key_padding_mask=padding_mask)
        pooled = encoded[:, 0, :]

        return torch.clamp(self.output_proj(pooled), -10, 10)


class RankingPredictor(nn.Module):
    """Score two embeddings with a shared MLP and return the clamped difference.

    Args:
        embedding_dim: Size of each input embedding.
    """

    def __init__(self, embedding_dim: int):
        super().__init__()

        stack = [
            nn.Linear(embedding_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ]
        self.score_net = nn.Sequential(*stack)

        # Reduced-gain Xavier weights and zero biases for every linear layer.
        for layer in self.score_net.children():
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.xavier_uniform_(layer.weight, gain=0.5)
            nn.init.zeros_(layer.bias)

    def forward(self, emb_a, emb_b):
        """Return ``score(emb_a) - score(emb_b)`` clamped to ``[-20, 20]``."""
        first = self.score_net(emb_a).squeeze(-1)
        second = self.score_net(emb_b).squeeze(-1)
        margin = first - second
        return margin.clamp(-20, 20)


# ===============================================
# DATA - Construction single pass
# ===============================================

# Surface one-hot table, defined once instead of being rebuilt for every match.
_SURFACE_ENCODING = {
    "Hard": [1, 0, 0, 0],
    "Clay": [0, 1, 0, 0],
    "Grass": [0, 0, 1, 0],
    "Carpet": [0, 0, 0, 1],
}
# Used when the surface is missing or not one of the four known values.
_UNKNOWN_SURFACE_ENCODING = [0.25, 0.25, 0.25, 0.25]
# Rank substituted when a player's rank is missing or NaN.
_DEFAULT_RANK = 500


def _rank_or_default(rank):
    """Return ``rank``, or the default rank when it is None or NaN."""
    if rank is None or np.isnan(rank):
        return _DEFAULT_RANK
    return rank


def _build_match_features(row, stat_columns, surface_enc, opponent_rank):
    """Build one player's feature vector for a single match.

    Stats are clipped to [0, 1] (0.5 when missing or NaN), followed by the
    surface encoding and the log-scaled opponent rank.
    """
    feats = []
    for column in stat_columns:
        value = row.get(column)
        if value is not None and not np.isnan(value):
            feats.append(np.clip(value, 0, 1))
        else:
            feats.append(0.5)
    feats.extend(surface_enc)
    feats.append(np.log1p(opponent_rank) / np.log1p(2000))

    # Guards against the column lists drifting out of sync with FEATURE_DIM.
    assert len(feats) == FEATURE_DIM, f"match_feats={len(feats)}, expected {FEATURE_DIM}"
    return feats


def _days_since(match_date, history):
    """Return days elapsed between each past match and ``match_date``.

    Values are capped at 1000; entries without a date fall back to 30.
    """
    return np.array(
        [
            min((match_date - past_date).days, 1000) if past_date else 30
            for past_date, _ in history
        ],
        dtype=np.float32,
    )


def build_dataset_single_pass(matches_df: pl.DataFrame):
    """Build train/validation samples in one chronological pass.

    Matches are sorted by ``tourney_date_ta`` and filtered to rows with a
    winner id, loser id and date. The split index is then taken on that
    filtered list. For each match, both players' previous matches (up to
    ``SEQ_LENGTH``) form the input sequences, and the current match is added
    to their histories only afterwards. A sample is emitted when both players
    have at least 5 past matches; each match yields a forward (label 1) and a
    mirrored (label 0) sample.

    Matches dated strictly before the cutoff go to train, strictly after go
    to validation. Matches on the cutoff date go to neither, but still update
    the player histories.

    Args:
        matches_df: Match table containing the id, date, stat, surface and
            rank columns read below.

    Returns:
        ``(train_data, val_data)``: two lists of sample dicts.

    Raises:
        ValueError: If no row has a winner id, loser id and date.
    """
    print("\n[Dataset] Building dataset (single pass, pre-filtered)...")

    # Filter BEFORE splitting so the cutoff index refers to usable rows only.
    matches_df = matches_df.sort("tourney_date_ta")
    matches_list = [
        r for r in matches_df.to_dicts()
        if r["winner_id"] and r["loser_id"] and r["tourney_date_ta"]
    ]

    if not matches_list:
        raise ValueError(
            "No valid matches: every row is missing winner_id, loser_id or tourney_date_ta"
        )

    # Clamp so a TRAIN_RATIO of 1.0 cannot index past the end of the list.
    split_idx = min(int(len(matches_list) * TRAIN_RATIO), len(matches_list) - 1)
    cutoff_date = matches_list[split_idx]["tourney_date_ta"]
    print(f"  Cutoff date: {cutoff_date} (matches on this date ignored)")
    print(f"  Valid matches: {len(matches_list):,}")
    print(f"  Train: {split_idx:,} | Val: {len(matches_list) - split_idx:,}")

    if 0 < split_idx < len(matches_list):
        print(f"  Train period: {matches_list[0]['tourney_date_ta']} ‚Üí {matches_list[split_idx-1]['tourney_date_ta']}")
        print(f"  Val period: {matches_list[split_idx]['tourney_date_ta']} ‚Üí {matches_list[-1]['tourney_date_ta']}")

    winner_features = [
        "w_s_ace_p", "w_s_df_p", "w_s_1stIn_p", "w_s_1stWon_p",
        "w_s_2ndWon_p", "w_ret_1stWon_p", "w_ret_2ndWon_p", "w_bp_conv_p"
    ]
    loser_features = [
        "l_s_ace_p", "l_s_df_p", "l_s_1stIn_p", "l_s_1stWon_p",
        "l_s_2ndWon_p", "l_ret_1stWon_p", "l_ret_2ndWon_p", "l_bp_conv_p"
    ]

    # player id -> chronological list of (match_date, feature_vector)
    player_history = defaultdict(list)
    train_data = []
    val_data = []

    for row in matches_list:
        match_id = row["custom_match_id"]
        winner_id = row["winner_id"]
        loser_id = row["loser_id"]
        match_date = row["tourney_date_ta"]

        surface = row.get("tourney_surface_ta") or row.get("surface", "Hard")
        surface_enc = _SURFACE_ENCODING.get(surface, _UNKNOWN_SURFACE_ENCODING)

        winner_rank = _rank_or_default(row.get("winner_rank_ta"))
        loser_rank = _rank_or_default(row.get("loser_rank_ta"))

        # Sequences are read BEFORE this match is appended to the histories.
        w_past = player_history[winner_id][-SEQ_LENGTH:]
        l_past = player_history[loser_id][-SEQ_LENGTH:]

        if len(w_past) >= 5 and len(l_past) >= 5:
            w_feats = np.array([feats for _, feats in w_past], dtype=np.float32)
            l_feats = np.array([feats for _, feats in l_past], dtype=np.float32)
            w_days = _days_since(match_date, w_past)
            l_days = _days_since(match_date, l_past)

            item_forward = {
                "match_id": match_id,
                "seq_a": w_feats,
                "seq_b": l_feats,
                "days_a": w_days,
                "days_b": l_days,
                "len_a": len(w_past),
                "len_b": len(l_past),
                "label": 1,
            }
            # Mirrored sample so the model cannot learn "player A always wins".
            item_reverse = {
                "match_id": match_id,
                "seq_a": l_feats,
                "seq_b": w_feats,
                "days_a": l_days,
                "days_b": w_days,
                "len_a": len(l_past),
                "len_b": len(w_past),
                "label": 0,
            }

            if match_date < cutoff_date:
                train_data.extend((item_forward, item_reverse))
            elif match_date > cutoff_date:
                val_data.extend((item_forward, item_reverse))

        # Features of THIS match; each player's vector ends with the opponent's rank.
        w_match_feats = _build_match_features(row, winner_features, surface_enc, loser_rank)
        l_match_feats = _build_match_features(row, loser_features, surface_enc, winner_rank)

        player_history[winner_id].append((match_date, w_match_feats))
        player_history[loser_id].append((match_date, l_match_feats))

    print(f"  Train dataset: {len(train_data):,}")
    print(f"  Val dataset: {len(val_data):,}")

    return train_data, val_data


class SequenceDataset(Dataset):
    """Dataset that left-pads paired sequences to a fixed length.

    Each item of ``data`` is a dict with ``seq_a``/``seq_b`` feature arrays,
    ``days_a``/``days_b`` arrays, ``len_a``/``len_b`` lengths, a ``label`` and
    a ``match_id``.

    Args:
        data: List of sample dicts.
        seq_length: Fixed length every sequence is padded or truncated to.
        feature_dim: Width of the zero rows used as feature padding.
        return_match_id: When True, ``match_id`` and ``is_forward`` are added
            to every returned sample.
    """

    def __init__(self, data: list, seq_length: int, feature_dim: int, return_match_id: bool = False):
        self.data = data
        self.seq_length = seq_length
        self.feature_dim = feature_dim
        self.return_match_id = return_match_id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the padded tensors (and optionally the ids) for sample ``idx``."""
        item = self.data[idx]

        feats = {side: self._pad_sequence(item[f"seq_{side}"]) for side in ("a", "b")}
        days = {side: self._pad_days(item[f"days_{side}"]) for side in ("a", "b")}

        # True marks the padded (leading) positions of each sequence.
        masks = {}
        for side in ("a", "b"):
            n_valid = min(item[f"len_{side}"], self.seq_length)
            n_pad = max(0, self.seq_length - n_valid)
            flags = np.zeros(self.seq_length, dtype=bool)
            flags[:n_pad] = True
            masks[side] = flags

        sample = {
            "seq_a": torch.tensor(feats["a"], dtype=torch.float),
            "seq_b": torch.tensor(feats["b"], dtype=torch.float),
            "days_a": torch.tensor(days["a"], dtype=torch.float),
            "days_b": torch.tensor(days["b"], dtype=torch.float),
            "mask_a": torch.tensor(masks["a"], dtype=torch.bool),
            "mask_b": torch.tensor(masks["b"], dtype=torch.bool),
            "label": torch.tensor(item["label"], dtype=torch.float),
        }

        if self.return_match_id:
            sample["match_id"] = item["match_id"]
            sample["is_forward"] = item["label"]

        return sample

    def _pad_sequence(self, seq):
        """Keep the last ``seq_length`` rows, or prepend zero rows to reach it."""
        missing = self.seq_length - len(seq)
        if missing <= 0:
            return seq[-self.seq_length:]
        filler = np.zeros((missing, self.feature_dim), dtype=np.float32)
        return np.concatenate([filler, seq], axis=0)

    def _pad_days(self, days):
        """Keep the last ``seq_length`` entries, or prepend 365s to reach it."""
        missing = self.seq_length - len(days)
        if missing <= 0:
            return days[-self.seq_length:]
        filler = np.full(missing, 365, dtype=np.float32)
        return np.concatenate([filler, days], axis=0)


# ===============================================
# TRAINING
# ===============================================

def compute_auc(y_true, y_proba):
    """Return the ROC AUC of ``y_proba`` against ``y_true``, or 0.5 as a fallback.

    The neutral value 0.5 is returned when fewer than two distinct labels are
    present, when scikit-learn is unavailable, or when ``roc_auc_score``
    rejects the inputs.

    Args:
        y_true: Array-like of ground-truth labels.
        y_proba: Array-like of predicted scores, same length as ``y_true``.
    """
    # AUC is undefined with fewer than two classes.
    if len(np.unique(y_true)) < 2 or not HAS_SKLEARN:
        return 0.5
    try:
        return roc_auc_score(y_true, y_proba)
    except ValueError:
        # Only invalid-input errors are swallowed; a bare ``except`` would
        # also hide KeyboardInterrupt and SystemExit.
        return 0.5


def get_amp_dtype():
    """Choose the autocast dtype explicitly.

    Returns ``torch.float32`` without CUDA; otherwise ``torch.bfloat16`` when
    the GPU reports bf16 support and ``torch.float16`` when it does not.
    """
    if not torch.cuda.is_available():
        return torch.float32
    return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16


# Batch entries moved to the device, in the order they are unpacked below.
_TRAIN_BATCH_KEYS = ("seq_a", "seq_b", "days_a", "days_b", "mask_a", "mask_b", "label")


def _batch_to_device(batch):
    """Move the tensors of one collated batch to DEVICE, in a fixed order."""
    return tuple(batch[key].to(DEVICE) for key in _TRAIN_BATCH_KEYS)


def _snapshot_state(module):
    """Return an independent copy of ``module.state_dict()``.

    ``state_dict()`` returns references to the live tensors, so holding it
    directly would track later optimizer updates instead of freezing them.
    """
    return {name: tensor.detach().clone() for name, tensor in module.state_dict().items()}


def train_transformer(train_loader, val_loader):
    """Train the sequence encoder and ranking head with early stopping.

    Uses AdamW with a linear warm-up over ``WARMUP_EPOCHS``, gradient-norm
    clipping at 0.5 and optional CUDA autocast. Validation AUC is tracked each
    epoch; training stops after ``PATIENCE`` epochs without an improvement
    greater than 0.001. The weights of the best epoch are restored before
    returning.

    Args:
        train_loader: Iterable of training batches (dicts of tensors).
        val_loader: Iterable of validation batches.

    Returns:
        ``(transformer, predictor)`` loaded with the best-epoch weights.
    """
    print("\n" + "=" * 50)
    print("  TRAINING SEQUENCE TRANSFORMER (v4 FINAL)")
    print("=" * 50)

    transformer = SequenceTransformer(
        input_dim=FEATURE_DIM,
        d_model=D_MODEL,
        n_heads=N_HEADS,
        n_layers=N_LAYERS,
        d_ff=D_FF,
        dropout=DROPOUT,
        output_dim=EMBEDDING_DIM
    ).to(DEVICE)

    predictor = RankingPredictor(EMBEDDING_DIM).to(DEVICE)

    criterion = nn.BCEWithLogitsLoss()
    params = list(transformer.parameters()) + list(predictor.parameters())
    optimizer = torch.optim.AdamW(params, lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    def lr_lambda(epoch):
        # Linear warm-up, then a constant learning rate.
        if epoch < WARMUP_EPOCHS:
            return (epoch + 1) / WARMUP_EPOCHS
        return 1.0

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    # AMP with an explicit dtype; loss scaling is only enabled for fp16.
    use_amp = USE_AMP and DEVICE.type == 'cuda'
    amp_dtype = get_amp_dtype() if use_amp else torch.float32
    scaler = torch.cuda.amp.GradScaler(enabled=(use_amp and amp_dtype == torch.float16))

    print(f"  Params: {sum(p.numel() for p in params):,}")
    print(f"  AMP: {use_amp} (dtype={amp_dtype})")

    best_val_auc = 0
    patience_counter = 0
    best_state = None
    best_epoch = -1

    for epoch in range(EPOCHS):
        # ---- Training ----
        transformer.train()
        predictor.train()

        train_losses = []
        train_preds_all = []
        train_labels_all = []

        for batch in train_loader:
            seq_a, seq_b, days_a, days_b, mask_a, mask_b, labels = _batch_to_device(batch)

            optimizer.zero_grad()

            if use_amp:
                with torch.cuda.amp.autocast(dtype=amp_dtype):
                    emb_a = transformer(seq_a, days_a, mask_a)
                    emb_b = transformer(seq_b, days_b, mask_b)
                    logits = predictor(emb_a, emb_b)
                    loss = criterion(logits, labels)

                # Skip batches whose loss is NaN.
                if torch.isnan(loss):
                    continue

                scaler.scale(loss).backward()
                # Unscale before clipping so the threshold applies to true gradients.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(params, 0.5)
                scaler.step(optimizer)
                scaler.update()
            else:
                emb_a = transformer(seq_a, days_a, mask_a)
                emb_b = transformer(seq_b, days_b, mask_b)

                if torch.isnan(emb_a).any() or torch.isnan(emb_b).any():
                    continue

                logits = predictor(emb_a, emb_b)
                loss = criterion(logits, labels)

                if torch.isnan(loss):
                    continue

                loss.backward()
                torch.nn.utils.clip_grad_norm_(params, 0.5)
                optimizer.step()

            train_losses.append(loss.item())
            with torch.no_grad():
                train_preds_all.extend(torch.sigmoid(logits).float().cpu().numpy())
            train_labels_all.extend(labels.float().cpu().numpy())

        # Every batch was skipped: nothing to evaluate for this epoch.
        if len(train_losses) == 0:
            continue

        train_loss = np.mean(train_losses)
        train_auc = compute_auc(np.array(train_labels_all), np.array(train_preds_all))

        # ---- Validation ----
        transformer.eval()
        predictor.eval()

        val_losses = []
        val_preds_all = []
        val_labels_all = []

        with torch.no_grad():
            for batch in val_loader:
                seq_a, seq_b, days_a, days_b, mask_a, mask_b, labels = _batch_to_device(batch)

                if use_amp:
                    with torch.cuda.amp.autocast(dtype=amp_dtype):
                        emb_a = transformer(seq_a, days_a, mask_a)
                        emb_b = transformer(seq_b, days_b, mask_b)
                        logits = predictor(emb_a, emb_b)
                        loss = criterion(logits, labels)
                else:
                    emb_a = transformer(seq_a, days_a, mask_a)
                    emb_b = transformer(seq_b, days_b, mask_b)
                    logits = predictor(emb_a, emb_b)
                    loss = criterion(logits, labels)

                val_losses.append(loss.item())
                val_preds_all.extend(torch.sigmoid(logits).float().cpu().numpy())
                val_labels_all.extend(labels.cpu().numpy())

        val_loss = np.mean(val_losses)
        val_preds_arr = np.array(val_preds_all)
        val_labels_arr = np.array(val_labels_all)
        val_auc = compute_auc(val_labels_arr, val_preds_arr)
        val_acc = ((val_preds_arr > 0.5) == val_labels_arr).mean()

        scheduler.step()

        if (epoch + 1) % 5 == 0:
            print(f"  Epoch {epoch+1:3d}: loss={train_loss:.4f}, AUC={train_auc:.4f} | val_loss={val_loss:.4f}, val_acc={val_acc:.4f}, val_AUC={val_auc:.4f}")

        if val_auc > best_val_auc + 0.001:
            best_val_auc = val_auc
            best_epoch = epoch + 1
            patience_counter = 0
            # Copy the tensors: a bare state_dict() aliases the live weights,
            # which would turn the final restore into a no-op.
            best_state = {
                'transformer': _snapshot_state(transformer),
                'predictor': _snapshot_state(predictor)
            }
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print(f"  Early stopping at epoch {epoch+1}")
                break

    if best_state:
        transformer.load_state_dict(best_state['transformer'])
        predictor.load_state_dict(best_state['predictor'])

    print(f"\n  ‚úÖ Best val AUC: {best_val_auc:.4f} (epoch {best_epoch})")

    return transformer, predictor


# ===============================================
# FEATURE EXTRACTION (batchée avec mask)
# ===============================================

def _match_key(match_id):
    """Normalise a (possibly collated) match id to the string used for lookups.

    The default DataLoader collate turns numeric ids into tensors, and
    ``str(tensor)`` yields ``'tensor(5)'`` rather than ``'5'``, so tensor
    elements are unwrapped first.
    """
    if isinstance(match_id, torch.Tensor):
        match_id = match_id.item()
    return str(match_id)


def extract_sequence_features_batched(transformer, all_data, matches_df):
    """Embed both players of every forward sample and derive per-match features.

    Only samples with ``label == 1`` are embedded, so ``seq_a`` is the winner
    and ``seq_b`` the loser. For each row of ``matches_df`` the output holds
    the cosine similarity, L2 distance, the first ``min(8, EMBEDDING_DIM)``
    embedding differences, both embedding norms and a ``has_sequence`` flag.
    Rows without an embedding get nulls and ``has_sequence = 0``.

    Args:
        transformer: Trained sequence encoder.
        all_data: List of sample dicts (train + validation).
        matches_df: Match table providing ``custom_match_id``.

    Returns:
        A polars DataFrame with one row per row of ``matches_df``, or an empty
        DataFrame when there is no forward sample.
    """
    print("\n[Features] Extracting sequence features (batched)...")

    transformer.eval()

    forward_data = [item for item in all_data if item["label"] == 1]

    if len(forward_data) == 0:
        print("  ‚ö†Ô∏è No data to extract")
        return pl.DataFrame()

    extract_dataset = SequenceDataset(forward_data, SEQ_LENGTH, FEATURE_DIM, return_match_id=True)
    extract_loader = DataLoader(extract_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

    # string match id -> (winner embedding, loser embedding)
    match_embeddings = {}

    use_amp = USE_AMP and DEVICE.type == 'cuda'
    amp_dtype = get_amp_dtype() if use_amp else torch.float32

    with torch.no_grad():
        for batch in extract_loader:
            seq_a = batch["seq_a"].to(DEVICE)
            seq_b = batch["seq_b"].to(DEVICE)
            days_a = batch["days_a"].to(DEVICE)
            days_b = batch["days_b"].to(DEVICE)
            mask_a = batch["mask_a"].to(DEVICE)
            mask_b = batch["mask_b"].to(DEVICE)
            match_ids = batch["match_id"]

            if use_amp:
                with torch.cuda.amp.autocast(dtype=amp_dtype):
                    emb_a = transformer(seq_a, days_a, mask_a)
                    emb_b = transformer(seq_b, days_b, mask_b)
            else:
                emb_a = transformer(seq_a, days_a, mask_a)
                emb_b = transformer(seq_b, days_b, mask_b)

            # Cast to float32 first: numpy has no bfloat16.
            emb_a_np = np.nan_to_num(emb_a.float().cpu().numpy(), nan=0.0)
            emb_b_np = np.nan_to_num(emb_b.float().cpu().numpy(), nan=0.0)

            for i, mid in enumerate(match_ids):
                match_embeddings[_match_key(mid)] = (emb_a_np[i], emb_b_np[i])

    print(f"  Extracted: {len(match_embeddings):,} matches")

    n_diff = min(8, EMBEDDING_DIM)
    diff_columns = [f"seq_diff_{k}" for k in range(n_diff)]

    results = []

    for row in matches_df.iter_rows(named=True):
        result = {"custom_match_id": row["custom_match_id"]}
        embeddings = match_embeddings.get(str(row["custom_match_id"]))

        if embeddings is not None:
            emb_w, emb_l = embeddings

            norm_w = np.linalg.norm(emb_w)
            norm_l = np.linalg.norm(emb_l)

            # Cosine similarity is undefined for (near-)zero vectors.
            if norm_w > 1e-6 and norm_l > 1e-6:
                result["seq_cosine_sim"] = float(np.dot(emb_w, emb_l) / (norm_w * norm_l))
            else:
                result["seq_cosine_sim"] = 0.0

            emb_diff = emb_w - emb_l
            result["seq_l2_distance"] = float(np.linalg.norm(emb_diff))

            for k, column in enumerate(diff_columns):
                result[column] = float(emb_diff[k])

            result["seq_norm_winner"] = float(norm_w)
            result["seq_norm_loser"] = float(norm_l)
            result["has_sequence"] = 1
        else:
            result["seq_cosine_sim"] = None
            result["seq_l2_distance"] = None
            for column in diff_columns:
                result[column] = None
            result["seq_norm_winner"] = None
            result["seq_norm_loser"] = None
            result["has_sequence"] = 0

        results.append(result)

    features_df = pl.DataFrame(results, infer_schema_length=None)
    coverage = features_df["has_sequence"].mean()
    print(f"  Coverage: {coverage:.1%}")

    return features_df


# ===============================================
# MAIN
# ===============================================

def main():
    """Run the pipeline: load, build dataset, train, extract features, save.

    Writes ``sequence_features.parquet`` and ``transformer_model.pt`` into
    ``OUTPUT_DIR``. The checkpoint's ``config`` entry records every
    hyper-parameter passed to the ``SequenceTransformer`` constructor, so the
    model can be rebuilt from the checkpoint alone.
    """
    t0 = datetime.now()

    print("\n[1/5] Loading matches...")
    matches_df = pl.read_parquet(MATCHES_BASE)
    print(f"  Matches: {len(matches_df):,}")

    print("\n[2/5] Building dataset...")
    train_data, val_data = build_dataset_single_pass(matches_df)

    train_loader = DataLoader(
        SequenceDataset(train_data, SEQ_LENGTH, FEATURE_DIM),
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=0
    )
    val_loader = DataLoader(
        SequenceDataset(val_data, SEQ_LENGTH, FEATURE_DIM),
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=0
    )

    print("\n[3/5] Training transformer...")
    transformer, predictor = train_transformer(train_loader, val_loader)

    print("\n[4/5] Extracting features...")
    all_data = train_data + val_data
    features_df = extract_sequence_features_batched(transformer, all_data, matches_df)

    print("\n[5/5] Saving...")
    output_path = OUTPUT_DIR / "sequence_features.parquet"
    features_df.write_parquet(output_path)
    print(f"  ‚úÖ Saved: {output_path}")

    torch.save({
        'transformer': transformer.state_dict(),
        'predictor': predictor.state_dict(),
        'config': {
            'd_model': D_MODEL,
            'n_heads': N_HEADS,
            'n_layers': N_LAYERS,
            # d_ff and dropout are constructor arguments too; without them
            # the saved weights cannot be matched to a rebuilt model.
            'd_ff': D_FF,
            'dropout': DROPOUT,
            'embedding_dim': EMBEDDING_DIM,
            'seq_length': SEQ_LENGTH,
            'feature_dim': FEATURE_DIM,
        }
    }, OUTPUT_DIR / "transformer_model.pt")

    elapsed = (datetime.now() - t0).total_seconds()

    print("\n" + "=" * 70)
    print("   ‚úÖ PP_15 SEQUENCE TRANSFORMER GOD SOTA v4 COMPLETE!")
    print("=" * 70)
    print(f"   ‚è±Ô∏è  Time: {elapsed:.1f}s")
    print(f"   üìä Train: {len(train_data):,} | Val: {len(val_data):,}")
    print("""
üìã v4 FINAL CORRECTIONS:
   ‚úÖ Assert len == FEATURE_DIM
   ‚úÖ Split sur liste PR√â-FILTR√âE
   ‚úÖ AMP dtype explicite (bf16/fp16)
   ‚úÖ match_id s√©curis√© str()
   ‚úÖ Dropout coh√©rent (self.dropout_p)
   ‚úÖ TF32 enabled
   ‚úÖ Single pass chronologique
   ‚úÖ Mask dans extraction
   ‚úÖ Extraction batch√©e
   ‚úÖ LayerNorm

üîÑ NEXT: PP_16 (Merge GOD Features)
""")


# Run the full pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

   PP_15 - SEQUENCE TRANSFORMER (GOD SOTA 2026 v4 FINAL)
   2025-12-17 08:41:19
   Device: cuda
   TF32: True
   Feature dim: 13

[1/5] Loading matches...
  Matches: 544,245

[2/5] Building dataset...

[Dataset] Building dataset (single pass, pre-filtered)...
  Cutoff date: 2024-01-22 (matches on this date ignored)
  Valid matches: 544,245
  Train: 489,820 | Val: 54,425
  Train period: 1942-08-01 ‚Üí 2024-01-22
  Val period: 2024-01-22 ‚Üí 2025-09-01
  Train dataset: 873,736
  Val dataset: 100,378

[3/5] Training transformer...

  TRAINING SEQUENCE TRANSFORMER (v4 FINAL)
  Params: 79,969
  AMP: True (dtype=torch.bfloat16)
  Epoch   5: loss=0.6121, AUC=0.7253 | val_loss=0.5888, val_acc=0.6928, val_AUC=0.7581
  Epoch  10: loss=0.5023, AUC=0.8343 | val_loss=0.4600, val_acc=0.7920, val_AUC=0.8775
  Epoch  15: loss=0.4576, AUC=0.8653 | val_loss=0.4126, val_acc=0.8082, val_AUC=0.8967
  Epoch  20: loss=0.4468, AUC=0.8719 | val_loss=0.4180, val_acc=0.8080, val_AUC=0.8968
  Epoch  25: loss=0.44