## Implementacja

Implementacja modelu Expected Threat wykorzystującego:
- **Transformer encoder** (12 warstw, 512 d_model) do przetwarzania sekwencji zdarzeń
- **Mixture Density Network (MDN)** z 5 komponentami Gaussowskimi do predykcji pozycji
- **Fourier position encoding** dla płynnej reprezentacji współrzędnych na boisku
- **Autoregresywna predykcja** z causal masking - każdy event przewiduje następny

Podejście eliminuje dyskretyzację siatki i pozwala modelowi przewidywać realistyczne, ciągłe rozkłady pozycji.

In [1]:
import pandas as pd
import numpy as np
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn.functional as F 

print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

PyTorch: 2.5.1+cu121
CUDA: True
GPU: NVIDIA GeForce RTX 4060 Laptop GPU


In [2]:
df_train = pd.read_parquet('data/sequences_continuous_train_balanced.parquet')
df_val = pd.read_parquet('data/sequences_continuous_val_natural.parquet')

print("=" * 60)
print("TRAIN SET")
print("=" * 60)
print(f"Sequences: {len(df_train):,}")
print(f"Goals: {df_train['goal'].sum():,} ({df_train['goal'].mean()*100:.1f}%)")

print("\n" + "=" * 60)
print("VALIDATION SET")
print("=" * 60)
print(f"Sequences: {len(df_val):,}")
print(f"Goals: {df_val['goal'].sum():,} ({df_val['goal'].mean()*100:.1f}%)")

# Load vocabulary
with open('data/vocab_continuous.json', 'r') as f:
    type_vocab = json.load(f)

with open('data/id_to_type_continuous.json', 'r') as f:
    id_to_type = json.load(f)
    id_to_type = {int(k): v for k, v in id_to_type.items()}

print(f"Vocab size: {len(type_vocab)}")
print(f"\nFirst sequence:")
print(df_train['events'].iloc[0][:5])

TRAIN SET
Sequences: 73,436
Goals: 3,736 (5.1%)

VALIDATION SET
Sequences: 51,219
Goals: 666 (1.3%)
Vocab size: 5

First sequence:
[{'end_x': 0.93, 'end_y': 0.5225, 'type': 'Pass', 'x': 0.70833, 'y': 0.3275}
 {'end_x': None, 'end_y': None, 'type': 'Shot', 'x': 0.895, 'y': 0.42625}
 {'end_x': None, 'end_y': None, 'type': 'GOAL', 'x': None, 'y': None}]


In [3]:
# Pokaż przykład z GOAL i NO_GOAL
goal_seq = df_train[df_train['goal'] == 1]['events'].iloc[0]
print(f"\nGOAL sequence (last 2 events):")
print(goal_seq[-2:])  # powinno być Shot → GOAL

no_goal_seq = df_train[df_train['goal'] == 0]['events'].iloc[0]
print(f"\nNO_GOAL sequence (last event):")
print(no_goal_seq[-2:])  # powinno być Shot/Pass → NO_GOAL


GOAL sequence (last 2 events):
[{'end_x': None, 'end_y': None, 'type': 'Shot', 'x': 0.895, 'y': 0.42625}
 {'end_x': None, 'end_y': None, 'type': 'GOAL', 'x': None, 'y': None}]

NO_GOAL sequence (last event):
[{'end_x': 0.90417, 'end_y': 0.005, 'type': 'Pass', 'x': 0.5825, 'y': 0.85375}
 {'end_x': None, 'end_y': None, 'type': 'NO_GOAL', 'x': None, 'y': None}]


In [4]:
# Sprawdź czy GOAL zawsze po Shot
goal_sequences = df_train[df_train['goal'] == 1]['events']

shots_before_goal = []
for seq in goal_sequences:
    if len(seq) >= 2:
        shots_before_goal.append(seq[-2]['type'])

from collections import Counter
print(Counter(shots_before_goal))

print(f"Sequence length stats:")
print(df_train['sequence_length'].describe())

Counter({'Shot': 3736})
Sequence length stats:
count    73436.000000
mean         5.856719
std          3.537457
min          2.000000
25%          3.000000
50%          5.000000
75%          8.000000
max         13.000000
Name: sequence_length, dtype: float64


### Przygotowanie danych dla modelu

Dataset implementuje causal shift: input to eventy [0, 1, ..., n-1], target to [1, 2, ..., n], gdzie każdy event przewiduje następny. 

Każdy event reprezentowany jako:
- **type_id** - typ zdarzenia (Pass, Shot, GOAL, etc.)
- **positions** - wektor [start_x, start_y, end_x, end_y] znormalizowany do [0,1]
- **start_mask** - czy event ma pozycję początkową (Pass/Shot mają, GOAL/NO_GOAL nie)
- **end_mask** - czy event ma pozycję końcową (tylko Pass)

Padding do max_seq_len=14 z ignore_index=-100 dla loss.

In [5]:
class ContinuousXTDataset(Dataset):
    def __init__(self, df, type_vocab, max_seq_len=14):
        self.df = df.reset_index(drop=True)
        self.type_vocab = type_vocab
        self.max_seq_len = max_seq_len
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        events = row['events']
        
        # Causal shift: input predicts next token
        input_events = events[:-1]   # [0, 1, 2, ..., n-1]
        target_events = events[1:]   # [1, 2, 3, ..., n]
        
        seq_len = len(input_events)  # rzeczywista długość (bez paddingu)

        input_type_ids = [self.type_vocab[e['type']] for e in input_events]
        target_type_ids = [self.type_vocab[e['type']] for e in target_events]

        # Build position tensors [start_x, start_y, end_x, end_y]
        def _build_positions(events_list):
            positions = []
            for e in events_list:
                pos = [
                    e['x'] if e['x'] is not None else 0.0,
                    e['y'] if e['y'] is not None else 0.0,
                    e['end_x'] if e['end_x'] is not None else 0.0,
                    e['end_y'] if e['end_y'] is not None else 0.0
                ]
                positions.append(pos)
            return positions

        input_positions = _build_positions(input_events)
        target_positions = _build_positions(target_events)

        # Build masks
        def _build_masks(events_list):
            start_masks = []
            end_masks = []
            for e in events_list:
                # start_mask: True if event has position (Pass/Shot)
                has_start = e['x'] is not None
                start_masks.append(has_start)
                
                # end_mask: True only for Pass (has end_x, end_y)
                has_end = e['end_x'] is not None
                end_masks.append(has_end)
            
            return start_masks, end_masks
        
        input_start_mask, input_end_mask = _build_masks(input_events)
        target_start_mask, target_end_mask = _build_masks(target_events)

        # Padding
        pad_len = self.max_seq_len - seq_len
        
        # Pad type IDs
        input_type_ids += [self.type_vocab['<pad>']] * pad_len
        target_type_ids += [-100] * pad_len  # ignore_index for loss
        
        # Pad positions (zeros)
        input_positions += [[0.0, 0.0, 0.0, 0.0]] * pad_len
        target_positions += [[0.0, 0.0, 0.0, 0.0]] * pad_len
        
        # Pad masks (False)
        input_start_mask += [False] * pad_len
        input_end_mask += [False] * pad_len
        target_start_mask += [False] * pad_len
        target_end_mask += [False] * pad_len
        
        # Convert to tensors
        return {
            'input_types': torch.tensor(input_type_ids, dtype=torch.long),
            'input_positions': torch.tensor(input_positions, dtype=torch.float32),
            'input_start_mask': torch.tensor(input_start_mask, dtype=torch.bool),
            'input_end_mask': torch.tensor(input_end_mask, dtype=torch.bool),
            
            'target_types': torch.tensor(target_type_ids, dtype=torch.long),
            'target_positions': torch.tensor(target_positions, dtype=torch.float32),
            'target_start_mask': torch.tensor(target_start_mask, dtype=torch.bool),
            'target_end_mask': torch.tensor(target_end_mask, dtype=torch.bool)
        }

In [6]:
# Test dataset
train_dataset = ContinuousXTDataset(df_train, type_vocab)
val_dataset = ContinuousXTDataset(df_val, type_vocab)

print(f"Train size: {len(train_dataset)}")
print(f"Val size: {len(val_dataset)}")

# Test jednej próbki
sample = train_dataset[0]
print(f"\nSample shapes:")
for k, v in sample.items():
    print(f"  {k}: {v.shape} ({v.dtype})")

print(f"\nInput types: {sample['input_types'][:5]}")
print(f"Target types: {sample['target_types'][:5]}")

Train size: 73436
Val size: 51219

Sample shapes:
  input_types: torch.Size([14]) (torch.int64)
  input_positions: torch.Size([14, 4]) (torch.float32)
  input_start_mask: torch.Size([14]) (torch.bool)
  input_end_mask: torch.Size([14]) (torch.bool)
  target_types: torch.Size([14]) (torch.int64)
  target_positions: torch.Size([14, 4]) (torch.float32)
  target_start_mask: torch.Size([14]) (torch.bool)
  target_end_mask: torch.Size([14]) (torch.bool)

Input types: tensor([0, 1, 4, 4, 4])
Target types: tensor([   1,    2, -100, -100, -100])


In [7]:
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=0,
    pin_memory=True if torch.cuda.is_available() else False
)

val_loader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=0,
    pin_memory=True if torch.cuda.is_available() else False
)

# Test batch
batch = next(iter(train_loader))
print(f"\nBatch shapes:")
for k, v in batch.items():
    print(f"  {k}: {v.shape}")


Batch shapes:
  input_types: torch.Size([32, 14])
  input_positions: torch.Size([32, 14, 4])
  input_start_mask: torch.Size([32, 14])
  input_end_mask: torch.Size([32, 14])
  target_types: torch.Size([32, 14])
  target_positions: torch.Size([32, 14, 4])
  target_start_mask: torch.Size([32, 14])
  target_end_mask: torch.Size([32, 14])


### Fourier Position Encoding

Zamiast standardowego embedowania współrzędnych, używamy kodowania Fouriera dla 4 współrzędnych [start_x, start_y, end_x, end_y]:
- 8 częstotliwości: [1, 2, 4, 8, 16, 32, 64, 128]
- sin i cos dla każdej częstotliwości → 4 coords × 8 freqs × 2 = 64 cechy
- Projekcja liniowa do d_model=512

Fourier features pozwalają na płynną interpolację przestrzenną - model lepiej rozumie podobieństwo pozycji bliskich sobie na boisku.

In [8]:
class FourierPositionEncoder(nn.Module):
    def __init__(self, freqs=[1,2,4,8,16,32,64,128], d_model=512):
        super().__init__()
        self.freqs = torch.tensor(freqs, dtype=torch.float32)
        # 4 coords × 8 freqs × 2 (sin/cos) = 64
        self.proj = nn.Linear(64, d_model)
    
    def forward(self, pos):
        # pos: [B, T, 4]
        B, T, _ = pos.shape
        
        # Expand: [B,T,4] → [B,T,4,1] × [8] → [B,T,4,8]
        freqs = self.freqs.to(pos.device)
        pos_expanded = pos.unsqueeze(-1)  # [B,T,4,1]
        angles = pos_expanded * freqs  # [B,T,4,8]
        
        # sin/cos
        sin_features = torch.sin(angles)  # [B,T,4,8]
        cos_features = torch.cos(angles)  # [B,T,4,8]
        
        # Flatten: [B,T,4,8,2] → [B,T,64]
        fourier = torch.stack([sin_features, cos_features], dim=-1)  # [B,T,4,8,2]
        fourier = fourier.reshape(B, T, -1)  # [B,T,64]
        
        return self.proj(fourier)  # [B,T,512]

In [9]:
encoder = FourierPositionEncoder()
pos = torch.rand(2, 5, 4)  # batch=2, seq=5
out = encoder(pos)
print(out.shape)  # torch.Size([2, 5, 512])

torch.Size([2, 5, 512])


In [10]:
sample_pos = batch['input_positions'][:2, :5]  # [2, 5, 4]
encoded = encoder(sample_pos)
print(f"Input: {sample_pos.shape}")
print(f"Output: {encoded.shape}")
print(f"Sample values: {encoded[0,0,:5]}")

Input: torch.Size([2, 5, 4])
Output: torch.Size([2, 5, 512])
Sample values: tensor([-0.4461, -0.0095,  0.1573,  0.2313, -0.8890], grad_fn=<SliceBackward0>)


### Architektura modelu: Transformer + MDN

Model składa się z:
1. **Embeddings**: type embedding + Fourier position encoding
2. **Transformer encoder** z causal masking (12 warstw, 8 head, d_model=512)
3. **Dwie głowice predykcyjne**:
   - Type head: logity dla vocab_size=7 (Pass, Shot, GOAL, etc.)
   - MDN head: parametry dla n_components=5 Gaussianów (8 parametrów × 5 = 40 wartości)

MDN przewiduje rozkład prawdopodobieństwa pozycji jako mieszankę 5 Gaussianów, każdy z wagą i parametrami dla start position (μ, σ) oraz end position (μx, μy, σx, σy).

In [11]:
class ContinuousXTModel(nn.Module):
    def __init__(self, vocab_size=6, d_model=512, nhead=8, num_layers=12, n_components=3):
        super().__init__()
        
        self.n_components = n_components  # Zapisz jako atrybut
        
        # Embeddings
        self.type_embedding = nn.Embedding(vocab_size, d_model)
        self.position_encoder = FourierPositionEncoder(d_model=d_model)
        
        # Transformer
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=2048,
            dropout=0.1,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        # Prediction heads
        self.type_head = nn.Linear(d_model, vocab_size)
        self.mdn_head = nn.Linear(d_model, n_components * 8)  # n_components × 8
    
    def forward(self, types, positions, start_mask):
        # Embeddings
        type_emb = self.type_embedding(types)
        pos_emb = self.position_encoder(positions)
        
        # Mask positions for GOAL/NO_GOAL
        pos_emb = pos_emb * start_mask.unsqueeze(-1).float()
        combined = type_emb + pos_emb
        
        # Causal mask
        T = types.size(1)
        causal_mask = torch.triu(torch.ones(T, T), diagonal=1).bool().to(types.device)
        
        # Transformer
        hidden = self.transformer(combined, mask=causal_mask)
        
        # Predictions
        type_logits = self.type_head(hidden)
        mdn_params = self.mdn_head(hidden).view(types.size(0), T, self.n_components, 8)  # [B,T,n_components,8]
        
        return type_logits, mdn_params

In [12]:
model = ContinuousXTModel(n_components=6)
type_logits, mdn_params = model(
    batch['input_types'][:4],
    batch['input_positions'][:4],
    batch['input_start_mask'][:4]
)
print(f"Type logits: {type_logits.shape}")
print(f"MDN params: {mdn_params.shape}")

Type logits: torch.Size([4, 14, 6])
MDN params: torch.Size([4, 14, 6, 8])


In [13]:
def parse_mdn_params(mdn_params):
    """
    mdn_params: [B, T, 5, 8]
    Returns: dict with weights and distributions
    """
    # Extract components
    weights = torch.nn.functional.softmax(mdn_params[..., 0], dim=-1)  # [B,T,5] - sum to 1
    
    # Start position (μ, σ)
    start_mean_x = torch.sigmoid(mdn_params[..., 1])  # [B,T,5] in [0,1]
    start_mean_y = torch.sigmoid(mdn_params[..., 2])
    start_std = torch.exp(mdn_params[..., 3]).clamp(0.005, 0.1)  # small variance
    
    # End position (μx, μy, σx, σy)
    end_mean_x = torch.sigmoid(mdn_params[..., 4])
    end_mean_y = torch.sigmoid(mdn_params[..., 5])
    end_std_x = torch.exp(mdn_params[..., 6]).clamp(0.01, 0.5)  # wider variance
    end_std_y = torch.exp(mdn_params[..., 7]).clamp(0.01, 0.5)
    
    return {
        'weights': weights,
        'start_mean': torch.stack([start_mean_x, start_mean_y], dim=-1),  # [B,T,5,2]
        'start_std': start_std,  # [B,T,5]
        'end_mean': torch.stack([end_mean_x, end_mean_y], dim=-1),  # [B,T,5,2]
        'end_std': torch.stack([end_std_x, end_std_y], dim=-1)  # [B,T,5,2]
    }

In [14]:
parsed = parse_mdn_params(mdn_params)
for k, v in parsed.items():
    print(f"{k}: {v.shape}")

weights: torch.Size([4, 14, 6])
start_mean: torch.Size([4, 14, 6, 2])
start_std: torch.Size([4, 14, 6])
end_mean: torch.Size([4, 14, 6, 2])
end_std: torch.Size([4, 14, 6, 2])


### Funkcje straty z wagami dla imbalance

**Type loss**: Weighted CrossEntropy, wagi dostosowane do częstości:
- GOAL: 30.0 (rzadkie, bardzo ważne)
- Shot: 10.0 (rzadkie)
- NO_GOAL: 2.0 (częste)
- Pass: 1.0 (bardzo częste, baseline)

**MDN loss**: Negative log-likelihood mieszanki Gaussianów
- Model uczy się przewidywać rozkład pozycji, nie punktowe wartości
- Dla każdego komponentu: waga × prawdopodobieństwo pod rozkładem
- Maskujemy pozycje dla GOAL/NO_GOAL (nie mają współrzędnych)

In [15]:
WEIGHT_CONFIG = {
    "START": 1.0,
    "Pass": 1.0,
    "Shot": 5.0,
    "GOAL": 15.0,
    "NO_GOAL": 1.0,
    "<pad>": 1.0
}


def type_loss(type_logits, target_types):
    # Wagi z config
    weights = torch.tensor([
        WEIGHT_CONFIG["START"],
        WEIGHT_CONFIG["Pass"],
        WEIGHT_CONFIG["Shot"],
        WEIGHT_CONFIG["GOAL"],
        WEIGHT_CONFIG["NO_GOAL"],
        WEIGHT_CONFIG["<pad>"]
    ]).to(type_logits.device)
    
    return F.cross_entropy(
        type_logits.reshape(-1, 6),
        target_types.reshape(-1),
        weight=weights,
        ignore_index=-100
    )

In [16]:
print(type_vocab)
print(id_to_type)

{'Pass': 0, 'Shot': 1, 'GOAL': 2, 'NO_GOAL': 3, '<pad>': 4}
{0: 'Pass', 1: 'Shot', 2: 'GOAL', 3: 'NO_GOAL', 4: '<pad>'}


In [17]:
def gaussian_nll(target, mean, std):
    """Negative log-likelihood of Gaussian"""
    variance = std ** 2
    return 0.5 * (torch.log(2 * torch.pi * variance) + ((target - mean) ** 2) / variance)

def mdn_loss(mdn_params, target_positions, target_start_mask, target_end_mask):
    """
    mdn_params: [B, T, n_components, 8]
    target_positions: [B, T, 4] - [start_x, start_y, end_x, end_y]
    """
    parsed = parse_mdn_params(mdn_params)
    B, T, n_components = mdn_params.shape[:3]  # Dynamicznie pobierz n_components
    
    # Target positions
    target_start = target_positions[..., :2]  # [B,T,2]
    target_end = target_positions[..., 2:]    # [B,T,2]
    
    # Compute NLL for each component
    component_nll = []
    for k in range(n_components):  # 5 → n_components
        # Start NLL
        start_nll = gaussian_nll(
            target_start.unsqueeze(2),
            parsed['start_mean'][:, :, k:k+1, :],
            parsed['start_std'][:, :, k:k+1].unsqueeze(-1)
        ).sum(dim=-1)
        
        # End NLL
        end_nll = gaussian_nll(
            target_end.unsqueeze(2),
            parsed['end_mean'][:, :, k:k+1, :],
            parsed['end_std'][:, :, k:k+1, :]
        ).sum(dim=-1)
        
        component_nll.append(start_nll + end_nll)
    
    component_nll = torch.cat(component_nll, dim=-1)  # [B,T,n_components]
    
    # Mixture NLL
    log_weights = torch.log(parsed['weights'] + 1e-8)
    mixture_nll = -torch.logsumexp(log_weights - component_nll, dim=-1)
    
    # Mask
    mask = target_start_mask.float()
    return (mixture_nll * mask).sum() / mask.sum()

In [18]:
type_l = type_loss(type_logits, batch['target_types'][:4])
mdn_l = mdn_loss(mdn_params, batch['target_positions'][:4], 
                 batch['target_start_mask'][:4], batch['target_end_mask'][:4])
print(f"Type loss: {type_l.item():.4f}")
print(f"MDN loss: {mdn_l.item():.4f}")

Type loss: 2.3751
MDN loss: 0.9919


In [19]:
# Combined loss
def combined_loss(model, batch):
    type_logits, mdn_params = model(
        batch['input_types'],
        batch['input_positions'],
        batch['input_start_mask']
    )
    
    t_loss = type_loss(type_logits, batch['target_types'])
    m_loss = mdn_loss(
        mdn_params, 
        batch['target_positions'],
        batch['target_start_mask'],
        batch['target_end_mask']
    )
    
    return t_loss + m_loss, t_loss, m_loss

### Trening modelu

Hiperparametry:
- **Optimizer**: AdamW (lr=1e-4, weight_decay=0.01)
- **Batch size**: 32
- **Epochs**: 10
- **Gradient clipping**: 1.0 (stabilizacja)
- **Loss**: type_loss + mdn_loss (suma)

Model trenowany na ~14.6k sekwencji (85% zbioru), walidacja na ~2.6k (15%).

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ContinuousXTModel(n_components=5).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0
    
    for batch_idx, batch in enumerate(train_loader):
        # Move to device
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # Forward
        loss, t_loss, m_loss = combined_loss(model, batch)
        
        # Backward
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        total_loss += loss.item()
        
        if batch_idx % 100 == 0:
            print(f"Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item():.4f} (type: {t_loss.item():.3f}, mdn: {m_loss.item():.3f})")
    
    print(f"Epoch {epoch+1} avg loss: {total_loss/len(train_loader):.4f}")

Epoch 1, Batch 0, Loss: 5.4809 (type: 1.499, mdn: 3.982)
Epoch 1, Batch 100, Loss: 0.2183 (type: 0.556, mdn: -0.337)
Epoch 1, Batch 200, Loss: -1.5495 (type: 0.493, mdn: -2.042)
Epoch 1, Batch 300, Loss: -1.8716 (type: 0.624, mdn: -2.495)
Epoch 1, Batch 400, Loss: -2.1230 (type: 0.537, mdn: -2.660)
Epoch 1, Batch 500, Loss: -2.5854 (type: 0.466, mdn: -3.051)
Epoch 1, Batch 600, Loss: -2.2863 (type: 0.512, mdn: -2.798)
Epoch 1, Batch 700, Loss: -2.6474 (type: 0.489, mdn: -3.136)
Epoch 1, Batch 800, Loss: -2.5677 (type: 0.445, mdn: -3.013)
Epoch 1, Batch 900, Loss: -2.5586 (type: 0.577, mdn: -3.136)
Epoch 1, Batch 1000, Loss: -2.4705 (type: 0.543, mdn: -3.013)
Epoch 1, Batch 1100, Loss: -2.6077 (type: 0.556, mdn: -3.164)
Epoch 1, Batch 1200, Loss: -2.7489 (type: 0.484, mdn: -3.233)
Epoch 1, Batch 1300, Loss: -2.7096 (type: 0.444, mdn: -3.154)
Epoch 1, Batch 1400, Loss: -2.5864 (type: 0.501, mdn: -3.088)
Epoch 1, Batch 1500, Loss: -2.7270 (type: 0.498, mdn: -3.225)
Epoch 1, Batch 1600, Lo

In [21]:
model.eval()
val_losses = []
with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        loss, t_loss, m_loss = combined_loss(model, batch)
        val_losses.append(loss.item())

print(f"Validation loss: {sum(val_losses)/len(val_losses):.4f}")

Validation loss: -2.9886


In [22]:
model.eval()
batch = next(iter(val_loader))
batch = {k: v.to(device) for k, v in batch.items()}

with torch.no_grad():
    type_logits, mdn_params = model(
        batch['input_types'],
        batch['input_positions'],
        batch['input_start_mask']
    )
    
    parsed = parse_mdn_params(mdn_params)
    
    # Sprawdź zakresy
    print(f"Weights sum: {parsed['weights'][0,0].sum()}")  # powinno być 1.0
    print(f"Start std range: {parsed['start_std'].min():.4f} - {parsed['start_std'].max():.4f}")
    print(f"End std range: {parsed['end_std'].min():.4f} - {parsed['end_std'].max():.4f}")
    print(f"\nSample component probabilities: {parsed['weights'][0,0]}")

Weights sum: 1.0
Start std range: 0.1000 - 0.1000
End std range: 0.0100 - 0.5000

Sample component probabilities: tensor([0.2687, 0.0605, 0.0011, 0.0967, 0.5730], device='cuda:0')


In [23]:
batch = next(iter(val_loader))
batch = {k: v.to(device) for k, v in batch.items()}

with torch.no_grad():
    type_logits, mdn_params = model(
        batch['input_types'],
        batch['input_positions'],
        batch['input_start_mask']
    )

pred_types = type_logits.argmax(dim=-1)
mask = batch['target_types'] != -100

for type_id in [1, 2, 3, 4, 5]:
    type_mask = (batch['target_types'] == type_id) & mask
    if type_mask.sum() > 0:
        acc = (pred_types[type_mask] == type_id).float().mean()
        print(f"{id_to_type[type_id]}: {acc:.1%} ({type_mask.sum()} samples)")

Shot: 0.0% (3 samples)
GOAL: 100.0% (1 samples)
NO_GOAL: 100.0% (31 samples)


### Wyliczanie xT przez Monte Carlo rollouts

xT obliczany jako prawdopodobieństwo gola poprzez symulacje kontynuacji akcji:
1. Startujemy z początkiem sekwencji (np. pierwsze 3 eventy)
2. Generujemy N=100 równoległych rolloutów (max 10 kroków każdy)
3. W każdym kroku:
   - Sampilujemy typ następnego eventu z rozkładu modelu
   - Sampilujemy pozycje z wybranego komponentu MDN
   - Kończymy gdy trafiony GOAL lub NO_GOAL
4. xT = odsetek rolloutów kończących się GOAL

To podejście daje probabilistyczną ocenę zagrożenia uwzględniającą niepewność modelu.

In [24]:
def calculate_xT_montecarlo(model, start_sequence, n_rollouts=100, max_steps=10, device='cuda'):
    model.eval()
    
    with torch.no_grad():
        # Initialize N parallel sequences
        seq_types = start_sequence['types'].unsqueeze(0).repeat(n_rollouts, 1)  # [N, T]
        seq_positions = start_sequence['positions'].unsqueeze(0).repeat(n_rollouts, 1, 1)
        seq_start_mask = start_sequence['start_mask'].unsqueeze(0).repeat(n_rollouts, 1)
        
        active = torch.ones(n_rollouts, dtype=torch.bool, device=device)  # które wciąż trwają
        goals = torch.zeros(n_rollouts, dtype=torch.bool, device=device)  # które skończyły GOAL
        
        for step in range(max_steps):
            if not active.any():
                break
            
            # Forward pass dla aktywnych
            type_logits, mdn_params = model(seq_types, seq_positions, seq_start_mask)
            
            # Sample types
            last_logits = type_logits[:, -1, :]  # [N, vocab_size]
            type_probs = F.softmax(last_logits, dim=-1)
            next_types = torch.multinomial(type_probs, 1).squeeze(-1)  # [N]
            
            # Check terminals
            is_goal = (next_types == type_vocab['GOAL']) & active
            is_no_goal = (next_types == type_vocab['NO_GOAL']) & active
            
            goals |= is_goal
            active &= ~(is_goal | is_no_goal)
            
            if not active.any():
                break
            
            # Sample positions dla aktywnych Pass/Shot
            parsed = parse_mdn_params(mdn_params[:, -1:])  # last timestep
            
            # Choose components
            weights = parsed['weights'][:, 0, :]  # [N, n_components]
            k = torch.multinomial(weights, 1).squeeze(-1)  # [N] - wybrany komponent
            
            # Gather selected component params
            batch_idx = torch.arange(n_rollouts, device=device)
            start_mean = parsed['start_mean'][batch_idx, 0, k]  # [N, 2]
            start_std = parsed['start_std'][batch_idx, 0, k]    # [N]
            end_mean = parsed['end_mean'][batch_idx, 0, k]      # [N, 2]
            end_std = parsed['end_std'][batch_idx, 0, k]        # [N, 2]
            
            # Sample positions
            start_xy = (start_mean + torch.randn_like(start_mean) * start_std.unsqueeze(-1)).clamp(0, 1)
            end_xy = (end_mean + torch.randn_like(end_mean) * end_std).clamp(0, 1)
            
            # Set end to 0 for Shot
            is_shot = (next_types == type_vocab['Shot'])
            end_xy[is_shot] = 0.0
            
            # Append
            new_pos = torch.cat([start_xy, end_xy], dim=-1).unsqueeze(1)  # [N, 1, 4]
            new_types = next_types.unsqueeze(1)  # [N, 1]
            new_mask = torch.ones(n_rollouts, 1, dtype=torch.bool, device=device)
            
            seq_types = torch.cat([seq_types, new_types], dim=1)
            seq_positions = torch.cat([seq_positions, new_pos], dim=1)
            seq_start_mask = torch.cat([seq_start_mask, new_mask], dim=1)
            
            if seq_types.size(1) >= 14:
                break
    
    return goals.float().mean().item()

In [25]:
def prepare_start_sequence(sample, device):
    """
    Przygotuj początkową sekwencję do Monte Carlo
    - Seq len 1-2: weź wszystko
    - Seq len >2: weź pierwsze 3
    """
    # Znajdź rzeczywistą długość (bez paddingu)
    real_len = (sample['input_types'] != type_vocab['<pad>']).sum().item()
    
    if real_len <= 2:
        # Weź wszystko oprócz ostatniego
        start_len = real_len
    else:
        # Weź pierwsze 3
        start_len = 3
    
    return {
        'types': sample['input_types'][:start_len].to(device),
        'positions': sample['input_positions'][:start_len].to(device),
        'start_mask': sample['input_start_mask'][:start_len].to(device)
    }

### Ewaluacja modelu

Metryki probabilistyczne (właściwe dla xT):
- **ROC-AUC**: zdolność rozróżnienia akcji bramkowych vs niebramkowych
- **Brier Score**: kalibracja prawdopodobieństw (niższy = lepiej)
- **Separacja**: różnica średniego xT dla goli vs nie-goli

Każda sekwencja z validation set otrzymuje xT przez 100 rolloutów Monte Carlo. Unikamy accuracy/F1 bo wczesne eventy w akcjach bramkowych *powinny* mieć niskie xT.

In [26]:
val_xTs = []
val_labels = []

for i in range(len(val_dataset)):
    sample = val_dataset[i]
    start_seq = prepare_start_sequence(sample, device)
    
    xT = calculate_xT_montecarlo(model, start_seq, n_rollouts=100)
    val_xTs.append(xT)
    
    # True label (czy akcja skończyła się golem)
    label = (sample['target_types'] == type_vocab['GOAL']).any().item()
    val_labels.append(label)
    
    if (i+1) % 100 == 0:
        print(f"Processed {i+1}/{len(val_dataset)}")

val_xTs = np.array(val_xTs)
val_labels = np.array(val_labels)

# Metrics
from sklearn.metrics import roc_auc_score, brier_score_loss

roc_auc = roc_auc_score(val_labels, val_xTs)
brier = brier_score_loss(val_labels, val_xTs)

print(f"\nROC-AUC: {roc_auc:.3f}")
print(f"Brier Score: {brier:.3f}")
print(f"Mean xT (goals): {val_xTs[val_labels==1].mean():.3f}")
print(f"Mean xT (no goals): {val_xTs[val_labels==0].mean():.3f}")

Processed 100/51219
Processed 200/51219
Processed 300/51219
Processed 400/51219
Processed 500/51219
Processed 600/51219
Processed 700/51219
Processed 800/51219
Processed 900/51219
Processed 1000/51219
Processed 1100/51219
Processed 1200/51219
Processed 1300/51219
Processed 1400/51219
Processed 1500/51219
Processed 1600/51219
Processed 1700/51219
Processed 1800/51219
Processed 1900/51219
Processed 2000/51219
Processed 2100/51219
Processed 2200/51219
Processed 2300/51219
Processed 2400/51219
Processed 2500/51219
Processed 2600/51219
Processed 2700/51219
Processed 2800/51219
Processed 2900/51219
Processed 3000/51219
Processed 3100/51219
Processed 3200/51219
Processed 3300/51219
Processed 3400/51219
Processed 3500/51219
Processed 3600/51219
Processed 3700/51219
Processed 3800/51219
Processed 3900/51219
Processed 4000/51219
Processed 4100/51219
Processed 4200/51219
Processed 4300/51219
Processed 4400/51219
Processed 4500/51219
Processed 4600/51219
Processed 4700/51219
Processed 4800/51219
P

In [27]:
# Weź sekwencję kończącą się GOAL
goal_sample = None
for i in range(len(val_dataset)):
    if (val_dataset[i]['target_types'] == type_vocab['GOAL']).any():
        goal_sample = val_dataset[i]
        break

print("Input types:", goal_sample['input_types'][:5])
print("Target types:", goal_sample['target_types'][:5])
print(f"\nGOAL in input? {(goal_sample['input_types'] == type_vocab['GOAL']).any()}")
print(f"GOAL in target? {(goal_sample['target_types'] == type_vocab['GOAL']).any()}")

Input types: tensor([0, 0, 0, 0, 0])
Target types: tensor([0, 0, 0, 0, 0])

GOAL in input? False
GOAL in target? True


In [28]:
import mlflow
mlflow.set_experiment("xT_MDN_Model")

with mlflow.start_run(run_name="MDN_Model"):
    # Params
    mlflow.log_param("model", "Transformer_MDN")
    mlflow.log_param("epochs", 10)
    mlflow.log_param("batch_size", 32)
    mlflow.log_param("lr", 1e-4)
    mlflow.log_param("d_model", 512)
    mlflow.log_param("num_layers", 12)
    mlflow.log_param("n_components", 5)
    mlflow.log_param("loss_weights", str(WEIGHT_CONFIG))
    mlflow.log_param("uwagi", 'top5')

    
    # Metrics
    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("brier_score", brier)
    mlflow.log_metric("mean_xT_goals", val_xTs[val_labels==1].mean())
    mlflow.log_metric("mean_xT_no_goals", val_xTs[val_labels==0].mean())
    mlflow.log_metric("separation", val_xTs[val_labels==1].mean() - val_xTs[val_labels==0].mean())
    
    # Model
    mlflow.pytorch.log_model(model, "model")
    
    print("✅ Logged to MLflow")

  return FileStore(store_uri, store_uri)


✅ Logged to MLflow


In [29]:
def visualize_mdn_heatmap(model, input_seq, device='cuda'):
    model.eval()
    
    with torch.no_grad():
        seq = {k: v.unsqueeze(0).to(device) for k, v in input_seq.items()}
        type_logits, mdn_params = model(seq['types'], seq['positions'], seq['start_mask'])
        
        parsed = parse_mdn_params(mdn_params[:, -1:])
        
        # Grid
        x = np.linspace(0, 1, 120)
        y = np.linspace(0, 1, 80)
        X, Y = np.meshgrid(x, y)
        Z = np.zeros_like(X)
        
        for k in range(5):
            weight = parsed['weights'][0, 0, k].cpu().numpy()
            mean = parsed['start_mean'][0, 0, k].cpu().numpy()
            std = parsed['start_std'][0, 0, k].cpu().numpy()
            Z += weight * np.exp(-((X - mean[0])**2 + (Y - mean[1])**2) / (2 * std**2))
        
        # Plot
        fig, ax = plt.subplots(figsize=(12, 8))
        im = ax.imshow(Z, origin='lower', extent=[0, 120, 0, 80], cmap='hot', aspect='auto')
        ax.set_xlabel('X (m)')
        ax.set_ylabel('Y (m)')
        ax.set_title('MDN Predicted Start Position Distribution')
        plt.colorbar(im, label='Probability Density')
        
        # Komponenty (niebieskie X)
        for k in range(5):
            mean = parsed['start_mean'][0, 0, k].cpu().numpy()
            ax.scatter(mean[0]*120, mean[1]*80, c='blue', s=100, marker='x', linewidths=3, label='Component' if k==0 else '')
        
        # AKTUALNA POZYCJA (koniec ostatniego eventu)
        last_pos = input_seq['positions'][-1].cpu().numpy()  # [4]: [start_x, start_y, end_x, end_y]
        current_x, current_y = last_pos[2], last_pos[3]  # end position
        ax.scatter(current_x*120, current_y*80, c='green', s=200, marker='o', edgecolors='black', linewidths=2, label='Current position', zorder=10)
        
        ax.legend()
        plt.tight_layout()
        return fig

