In [15]:
import pandas as pd
import numpy as np

from rl_lib.swiss_round.environment import SwissRoundEnv
from rl_lib.swiss_round.agent import DQNAgent

%reload_ext autoreload
%autoreload 2

In [16]:
# Tournament configuration shared by every cell below.
n_teams = 18
n_rounds = 6

# Strengths halve from one team to the next: 1, 0.5, 0.25, ...
team_strengths = [0.5 ** seed for seed in range(n_teams)]

# Rank thresholds and bonuses are paired by position (they are zipped together
# when the baseline rewards are computed).
threshold_ranks = [4, 12]
bonus_points = [20, 20]

# The agent plays as team 0, i.e. the strongest team under the strengths above.
# Alternative kept for reference: agent_id = threshold_ranks[-1]
# (agent_id just below the last threshold).
agent_id = 0

# Gather every environment setting in one mapping, then build the environment
# from it; all values come from the configuration defined just above.
env_settings = dict(
    n_teams=n_teams,
    n_rounds=n_rounds,
    team_strengths=team_strengths,
    threshold_ranks=threshold_ranks,
    bonus_points=bonus_points,
    agent_id=agent_id,
    max_draw_probability=0.3,
)
env = SwissRoundEnv(**env_settings)

### Baseline Simulations

In [17]:
# Baseline: simulate 2000 tournaments with the 'win_all' policy.
simulation_wa = env.simulate_n_tournaments(
    2000,
    n_cores=24,
    policy='win_all',
    display_results=False,
)

# Baseline reward = the agent's average points plus each bonus weighted by the
# agent's "Top-<rank> %" value for the matching threshold.
wa_bonus_terms = [
    bonus * simulation_wa.loc[agent_id, f"Top-{rank} %"]
    for bonus, rank in zip(bonus_points, threshold_ranks)
]
baseline_reward_wa = simulation_wa.loc[agent_id, 'Avg_Points'] + sum(wa_bonus_terms)
print(f"Baseline WinAll average reward = {baseline_reward_wa:.1f}")

# Show the agent's full summary row as the cell output.
simulation_wa.loc[agent_id]

Simulating tournaments: 100%|██████████| 2000/2000 [00:07<00:00, 253.76it/s]


Baseline WinAll average reward = 44.0


Team           0.000000
Strength       1.000000
Avg_Points    12.201918
Avg_Rank       4.353357
Top-4 %        0.656739
Top-12 %       0.931853
Name: 0, dtype: float64

In [18]:
# Baseline: simulate 2000 tournaments with the 'lose_first' policy.
simulation_lf = env.simulate_n_tournaments(
    2000,
    n_cores=24,
    policy='lose_first',
    display_results=False,
)

# Same reward definition as the other baseline: average points plus each bonus
# weighted by the agent's "Top-<rank> %" value.
lf_bonus_terms = [
    bonus * simulation_lf.loc[agent_id, f"Top-{rank} %"]
    for bonus, rank in zip(bonus_points, threshold_ranks)
]
baseline_reward_lf = simulation_lf.loc[agent_id, 'Avg_Points'] + sum(lf_bonus_terms)
print(f"Baseline LoseFirst average reward = {baseline_reward_lf:.1f}")

# Show the agent's full summary row as the cell output.
simulation_lf.loc[agent_id]

Simulating tournaments: 100%|██████████| 2000/2000 [00:08<00:00, 229.48it/s]

Baseline LoseFirst average reward = 35.5





Team           0.000000
Strength       1.000000
Avg_Points    10.242455
Avg_Rank       6.764085
Top-4 %        0.416499
Top-12 %       0.844567
Name: 0, dtype: float64

### RL Agent

In [19]:
# Recap both baseline rewards so they sit next to the training log below.
print(
    f"Baselines average reward : WinAll = {baseline_reward_wa:.1f}, "
    f"LoseFirst = {baseline_reward_lf:.1f}"
)

Baselines average reward : WinAll = 44.0, LoseFirst = 35.5


In [None]:
# Build the DQN agent on the configured environment.
# NOTE(review): the saved log shows epsilon at about 0.995 after 100 episodes,
# slower than 0.9995 applied once per episode would give (about 0.951);
# confirm how epsilon_decay is applied, or whether the saved output is stale.
agent = DQNAgent(
    env,
    hidden_size=256,
    buffer_size=100_000,
    epsilon_decay=0.9995,
)

# Train for 4000 episodes.
agent.train(n_episodes=4_000)

Episode 100/4000 | Avg Reward: 12.11 | Avg nb gambits played 4.18 | Epsilon: 0.995 | Failed episodes: 0
Episode 200/4000 | Avg Reward: 11.85 | Avg nb gambits played 4.07 | Epsilon: 0.989 | Failed episodes: 0
Episode 300/4000 | Avg Reward: 10.69 | Avg nb gambits played 4.08 | Epsilon: 0.983 | Failed episodes: 0
Episode 400/4000 | Avg Reward: 14.12 | Avg nb gambits played 3.89 | Epsilon: 0.977 | Failed episodes: 0
Episode 500/4000 | Avg Reward: 13.96 | Avg nb gambits played 3.95 | Epsilon: 0.971 | Failed episodes: 0
Episode 600/4000 | Avg Reward: 16.67 | Avg nb gambits played 3.70 | Epsilon: 0.965 | Failed episodes: 0
Episode 700/4000 | Avg Reward: 14.40 | Avg nb gambits played 3.72 | Epsilon: 0.959 | Failed episodes: 0
Episode 800/4000 | Avg Reward: 14.01 | Avg nb gambits played 3.86 | Epsilon: 0.953 | Failed episodes: 0
Episode 900/4000 | Avg Reward: 14.58 | Avg nb gambits played 3.95 | Epsilon: 0.948 | Failed episodes: 0
Episode 1000/4000 | Avg Reward: 14.43 | Avg nb gambits played 3.

KeyboardInterrupt: 

### Verbose simulation

In [None]:
# Run one tournament with verbose output enabled and keep the final standings.
final_standings = env.simulate_tournament(verbose=True)

# NOTE(review): the header lists three fields, but each standing unpacks into
# four (strength is the last one); consider aligning the header text.
print("\nFinal standings (team_id, points, opponent_average):")

# Ranks are numbered from 1 in the order the standings are returned.
for rank, standing in enumerate(final_standings, start=1):
    team_id, points, opp_avg, strength = standing
    print(f"Rank {rank}: Team {team_id} - Strength {strength:.2f} - Points: {points} - Opponent Avg: {opp_avg:.2f}")
