## Run Training Sequence
This notebook can be run all at once to build the automatic transformer model.
If greater computational resources are available, increase the number of sequences in the training loop and final training for a stronger final model.

In [None]:
%reset -f
import os
import sys
import importlib
import logging
from pathlib import Path
import polars as pl
import torch
import importlib
import traceback
import inspect
import random
from dataclasses import asdict
from typing import Dict, Optional
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Setup project path
# The project root is the nearest directory -- starting at the current working
# directory and walking up through its parents -- that contains a "src" folder.
proj = Path.cwd()
root = next((p for p in (proj, *proj.parents) if (p / "src").exists()), None)
if root is None:
    # Without a default, next() would surface as an unexplained StopIteration;
    # fail with an actionable message instead.
    raise RuntimeError(
        f"Could not locate project root: no 'src' directory in {proj} or any of its parents"
    )

# Set up python path
# Run from the project root so relative paths used later in the notebook
# (e.g. the "models" output directory) resolve against it, and put the root on
# sys.path so the `src.` package imports in the following cells work.
os.chdir(root)
if str(root) not in sys.path:
    sys.path.insert(0, str(root))
logger.info("Project root configured: %s", root)

# Verify critical paths exist
# Stop at the first missing directory rather than failing later mid-run.
for path in ["src", "data", "data/raw", "data/processed", "notebooks"]:
    if not (root / path).exists():
        raise RuntimeError(f"Missing required path: {root / path}")

In [None]:
from src.models.player_dataset import load_processed_data, join_teamframe, build_player_id_map
from src.models.train_player_model import (
    FEATURE_COLS,
    RANDOM_SEED,
    set_seed,
    agentic_search_frames,
    train_final_frame_model,
    predict_frames_on_test,
)

# Apply the project's RANDOM_SEED before any data loading, search, or training
# cell runs.
# NOTE(review): which RNGs set_seed actually seeds is defined in
# src.models.train_player_model -- confirm there.
set_seed(RANDOM_SEED)


In [None]:
# Load the four processed tables returned by load_processed_data: player and
# team-frame data for the train and test splits.
players_train, players_test, teamframe_train, teamframe_test = load_processed_data()

# Combine each player table with the team-frame table of the same split.
players_train_joined = join_teamframe(players_train, teamframe_train)
players_test_joined = join_teamframe(players_test, teamframe_test)

# The player-id map is built from the joined training split only.
player_id_map = build_player_id_map(players_train_joined)

# Report the size of each joined table and the number of mapped players.
for _label, _joined in (
    ("players_train_joined:", players_train_joined),
    ("players_test_joined:", players_test_joined),
):
    print(_label, _joined.shape)
print("num unique players:", len(player_id_map))


In [None]:
# Preview the head of the joined training table; as the cell's final
# expression, the result is rendered as the cell output.
players_train_joined.head()

In [None]:
## double check for nulls
# players_train_joined.select([
#     *[pl.col(c).is_null().sum().alias(f"{c}_nulls") for c in FEATURE_COLS]
# ]).to_pandas().T.head(40)


In [None]:
# Agentic search (frame-based)
# The result is indexed by key: "history" is tabulated and plotted below, and
# "cfg" is what the final model is trained with.
best_agentic_frames = agentic_search_frames(
    players_train_joined,
    player_id_map,
    FEATURE_COLS,
    n_sequences=3000,
    trials=100,
)

# Turn the search history into a DataFrame for display and plotting.
hist = best_agentic_frames["history"]
hist_df = pd.DataFrame(hist)

display(hist_df)

# CV RMSE for each search trial, in trial order.
trial_fig, trial_ax = plt.subplots(figsize=(6, 4))
trial_ax.plot(hist_df["trial"], hist_df["cv_rmse"], marker="o")
trial_ax.set_xlabel("Trial")
trial_ax.set_ylabel("CV RMSE")
trial_ax.set_title("Frame-level Agentic Search: CV RMSE by Trial")
trial_ax.grid(True)
trial_fig.tight_layout()
plt.show()

# CV RMSE against the model_dim value tried in each trial.
dim_fig, dim_ax = plt.subplots(figsize=(6, 4))
dim_ax.scatter(hist_df["model_dim"], hist_df["cv_rmse"])
dim_ax.set_xlabel("model_dim")
dim_ax.set_ylabel("CV RMSE")
dim_ax.set_title("CV RMSE vs model_dim")
dim_ax.grid(True)
dim_fig.tight_layout()
plt.show()

# Train final frame model on a slightly larger subset
# Uses the "cfg" entry of the agentic search result and more sequences than the
# search itself (10000 vs 3000).
frame_model = train_final_frame_model(
    players_train_joined,
    player_id_map,
    best_agentic_frames["cfg"],
    n_sequences=10000,  # make larger if computational resources allow
)

# Save and predict on test
import os
import torch

# Both artifacts below are written under the "models" directory, relative to
# the current working directory.
os.makedirs("models", exist_ok=True)
checkpoint_path = "models/player_physics_transformer_frames_agentic.pt"
predictions_path = "models/frame_level_predictions_agentic.csv"

# The checkpoint bundles the trained weights, the search-selected config
# (converted to a plain dict with asdict), and the player-id map the model was
# trained with.
checkpoint = {
    "state_dict": frame_model.state_dict(),
    "cfg": asdict(best_agentic_frames["cfg"]),
    "player_id_map": player_id_map,
}
torch.save(checkpoint, checkpoint_path)

# Run the trained model over the joined test table; out_path is handed to
# predict_frames_on_test as the destination for the predictions.
predict_frames_on_test(
    frame_model,
    players_test_joined,
    player_id_map,
    out_path=predictions_path,
)
