In [1]:
import pandas as pd
import numpy as np

from rl_lib.swiss_round.environment import SwissRoundEnv
from rl_lib.swiss_round.agent import DQNAgent

%reload_ext autoreload
%autoreload 2

In [None]:
# --- Tournament configuration -------------------------------------------
n_teams = 18
n_rounds = 6

# Geometric strengths: team i gets 0.5 ** i, so team 0 has the largest
# value and each following team has half the strength of the previous one.
team_strengths = [0.5 ** i for i in range(n_teams)]

# Rank cut-offs and the bonus attached to each one (paired by position
# when the baseline rewards are computed).
threshold_ranks = [4, 12]
bonus_points = [20, 20]

# The agent is the team whose id equals the last rank threshold.
agent_id = threshold_ranks[-1]

# Number of tournaments simulated for each baseline policy.
n_baselines_simu = 2000

# --- Environment ---------------------------------------------------------
env = SwissRoundEnv(
    n_teams=n_teams,
    n_rounds=n_rounds,
    team_strengths=team_strengths,
    threshold_ranks=threshold_ranks,
    bonus_points=bonus_points,
    agent_id=agent_id,
    max_draw_probability=0.3,
)

### Baseline Simulations

In [None]:
# Baseline 1: run the 'win_all' policy over many simulated tournaments.
simulation_wa = env.simulate_n_tournaments(
    n_baselines_simu,
    n_cores=32,
    policy='win_all',
    display_results=False,
)

# Bonus term: each threshold's bonus weighted by the agent's "Top-<t> %" value
# (presumably the share of simulated tournaments finishing within rank t -- TODO confirm).
expected_bonus_wa = sum(
    bonus * simulation_wa.loc[agent_id, f"Top-{t} %"]
    for bonus, t in zip(bonus_points, threshold_ranks)
)

# Baseline reward = average points + bonus term.
baseline_reward_wa = simulation_wa.loc[agent_id, 'Avg_Points'] + expected_bonus_wa
print(f"Baseline WinAll average reward = {baseline_reward_wa:.1f}")

# Show the agent's full summary row as the cell output.
simulation_wa.loc[agent_id]

Simulating tournaments: 100%|██████████| 1000/1000 [00:03<00:00, 257.34it/s]


Baseline WinAll average reward = 43.2


Team           0.000000
Strength       1.000000
Avg_Points    11.950655
Avg_Rank       4.561934
Top-4 %        0.622356
Top-12 %       0.939577
Name: 0, dtype: float64

In [None]:
# Baseline 2: run the 'lose_first' policy over many simulated tournaments.
simulation_lf = env.simulate_n_tournaments(
    n_baselines_simu,
    n_cores=32,
    policy='lose_first',
    display_results=False,
)

# Bonus term: each threshold's bonus weighted by the agent's "Top-<t> %" value
# (presumably the share of simulated tournaments finishing within rank t -- TODO confirm).
expected_bonus_lf = sum(
    bonus * simulation_lf.loc[agent_id, f"Top-{t} %"]
    for bonus, t in zip(bonus_points, threshold_ranks)
)

# Baseline reward = average points + bonus term.
baseline_reward_lf = simulation_lf.loc[agent_id, 'Avg_Points'] + expected_bonus_lf
print(f"Baseline LoseFirst average reward = {baseline_reward_lf:.1f}")

# Show the agent's full summary row as the cell output.
simulation_lf.loc[agent_id]

Simulating tournaments: 100%|██████████| 1000/1000 [00:04<00:00, 238.65it/s]


Baseline LoseFirst average reward = 35.0


Team           0.000000
Strength       1.000000
Avg_Points    10.111446
Avg_Rank       6.927711
Top-4 %        0.412651
Top-12 %       0.831325
Name: 0, dtype: float64

### RL Agent

In [5]:
# Recap of both baseline rewards, kept next to the RL training log for comparison.
baselines_recap = f"Baselines average reward : WinAll = {baseline_reward_wa:.1f}, LoseFirst = {baseline_reward_lf:.1f}"
print(baselines_recap)

Baselines average reward : WinAll = 43.2, LoseFirst = 35.0


In [None]:
# Build the DQN agent on top of the tournament environment.
agent = DQNAgent(
    env,
    hidden_dims=[256, 128, 64],
    dropout=0.1,
    buffer_size=10000,
    epsilon_decay=0.9995,
)

# Train for 6000 episodes; the training log is printed below the cell.
agent.train(n_episodes=6000)

Episode 100/6000 | Avg Reward: 14.14 | Avg nb gambits played 3.94 | Epsilon: 0.765 | Failed episodes: 0
Episode 200/6000 | Avg Reward: 15.79 | Avg nb gambits played 3.48 | Epsilon: 0.567 | Failed episodes: 0
Episode 300/6000 | Avg Reward: 22.28 | Avg nb gambits played 2.91 | Epsilon: 0.419 | Failed episodes: 0
Episode 400/6000 | Avg Reward: 28.09 | Avg nb gambits played 2.31 | Epsilon: 0.309 | Failed episodes: 0
Episode 500/6000 | Avg Reward: 28.66 | Avg nb gambits played 1.55 | Epsilon: 0.229 | Failed episodes: 0
Episode 600/6000 | Avg Reward: 37.35 | Avg nb gambits played 1.26 | Epsilon: 0.168 | Failed episodes: 0
Episode 700/6000 | Avg Reward: 35.50 | Avg nb gambits played 0.85 | Epsilon: 0.125 | Failed episodes: 0
Episode 800/6000 | Avg Reward: 35.99 | Avg nb gambits played 0.89 | Epsilon: 0.092 | Failed episodes: 0
Episode 900/6000 | Avg Reward: 34.93 | Avg nb gambits played 0.80 | Epsilon: 0.068 | Failed episodes: 0
Episode 1000/6000 | Avg Reward: 41.60 | Avg nb gambits played 0.

### Verbose simulation

In [None]:
# Run a single tournament with verbose output enabled.
final_standings = env.simulate_tournament(verbose=True)

print("\nFinal standings (team_id, points, opponent_average):")

# Each standing is unpacked into four fields; ranks are numbered from 1
# in the order the standings are returned.
for rank, standing in enumerate(final_standings, start=1):
    team_id, points, opp_avg, strength = standing
    print(f"Rank {rank}: Team {team_id} - Strength {strength:.2f} - Points: {points} - Opponent Avg: {opp_avg:.2f}")
