In [1]:
import pandas as pd
import numpy as np

from rl_lib.swiss_round.environment import SwissRoundEnv
from rl_lib.swiss_round.agent import DQNAgent

# Reload edited modules automatically before each cell runs (autoreload mode 2),
# so changes to the imported rl_lib code are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
# --- Tournament configuration ------------------------------------------------
# NOTE(review): no random seed is set anywhere in the visible notebook, so the
# training log and simulation tables below are presumably not reproducible
# run-to-run — consider seeding the RNGs the library uses.
n_teams = 18   # number of teams in the tournament
n_rounds = 6   # number of rounds played per tournament

# Strengths halve from one team to the next: team i gets 0.5 ** i (team 0 -> 1.0).
team_strengths = [0.5 ** i for i in range(n_teams)]

# Rank cut-offs and the bonus attached to each cut-off. The baseline cells pair
# the two lists with zip(), which silently drops unmatched trailing entries, so
# guard the one-to-one correspondence here.
threshold_ranks = [4, 12]
bonus_points = [20, 20]
assert len(threshold_ranks) == len(bonus_points), (
    "threshold_ranks and bonus_points must have the same length"
)

# Team ids are 0-based in the simulation tables, so the id equal to the last
# threshold rank is the first team (by strength) just below that threshold.
agent_id = threshold_ranks[-1]

# Shared environment used by the RL agent and by all baseline simulations.
env = SwissRoundEnv(
    n_teams=n_teams,
    n_rounds=n_rounds,
    team_strengths=team_strengths,
    threshold_ranks=threshold_ranks,
    bonus_points=bonus_points,
    agent_id=agent_id,
    max_draw_probability=0.3,
)

### RL Agent

In [5]:
# Create the DQN agent bound to the shared environment.
agent = DQNAgent(
    env,
    hidden_size=256,
    buffer_size=100_000,
    epsilon_decay=0.999,
)

# Run the training loop; progress is logged by the agent itself.
agent.train(n_episodes=10_000)

  states = torch.FloatTensor([e.state for e in experiences]).to(self.device)


Episode 100/10000, Avg Reward: 2.60, Epsilon: 0.580, (failed episodes: 2)
Episode 200/10000, Avg Reward: 6.00, Epsilon: 0.318, (failed episodes: 2)
Episode 300/10000, Avg Reward: 7.00, Epsilon: 0.174, (failed episodes: 3)
Episode 400/10000, Avg Reward: 9.20, Epsilon: 0.095, (failed episodes: 4)
Episode 500/10000, Avg Reward: 8.20, Epsilon: 0.052, (failed episodes: 4)
Episode 600/10000, Avg Reward: 9.60, Epsilon: 0.029, (failed episodes: 4)
Episode 700/10000, Avg Reward: 10.60, Epsilon: 0.016, (failed episodes: 4)
Episode 800/10000, Avg Reward: 10.20, Epsilon: 0.010, (failed episodes: 5)
Episode 900/10000, Avg Reward: 11.40, Epsilon: 0.010, (failed episodes: 5)
Episode 1000/10000, Avg Reward: 11.20, Epsilon: 0.010, (failed episodes: 5)
Episode 1100/10000, Avg Reward: 11.60, Epsilon: 0.010, (failed episodes: 6)
Episode 1200/10000, Avg Reward: 10.20, Epsilon: 0.010, (failed episodes: 6)
Episode 1300/10000, Avg Reward: 9.80, Epsilon: 0.010, (failed episodes: 7)
Episode 1400/10000, Avg Rewa

### Baseline Simulations

In [None]:
# Baseline: simulate 2000 tournaments with the 'win_all' policy.
simulation_wa = env.simulate_n_tournaments(
    2000, n_cores=24, policy='win_all', display_results=False
)

# Agent's baseline reward: its average points plus every bonus weighted by the
# matching "Top-<rank> %" value from the simulation table.
baseline_reward_wa = simulation_wa.loc[agent_id, 'Avg_Points'] + sum(
    bonus * simulation_wa.loc[agent_id, f"Top-{rank} %"]
    for bonus, rank in zip(bonus_points, threshold_ranks)
)
print(f"Baseline WinAll average reward = {baseline_reward_wa:.1f}")

# Last expression: display the full per-team simulation table.
simulation_wa

Simulating tournaments: 100%|██████████| 2000/2000 [00:07<00:00, 250.26it/s]


Baseline WinAll average reward = 23.2


Unnamed: 0,Team,Strength,Avg_Points,Avg_Rank,Top-4 %,Top-12 %
0,0,1.0,12.002015,4.496725,0.647859,0.93199
1,1,0.5,10.033249,6.919395,0.410579,0.8267
2,2,0.25,9.037783,8.422166,0.281612,0.755668
3,3,0.125,8.406549,9.356675,0.215113,0.685642
4,4,0.0625,8.1733,9.75063,0.186902,0.657935
5,5,0.03125,8.080605,9.917884,0.192443,0.633249
6,6,0.015625,7.91335,10.153149,0.159698,0.643325
7,7,0.007812,8.027204,9.98539,0.1733,0.643829
8,8,0.003906,7.961713,10.153149,0.175819,0.624181
9,9,0.001953,7.965239,10.079093,0.186398,0.627708


In [None]:
# Baseline: simulate 2000 tournaments with the 'lose_first' policy.
simulation_lf = env.simulate_n_tournaments(
    2000,
    n_cores=24,
    policy='lose_first',
    display_results=False,
)

# Agent's baseline reward: its average points plus every bonus weighted by the
# matching "Top-<rank> %" value from the simulation table.
baseline_reward_lf = simulation_lf.loc[agent_id, 'Avg_Points'] + sum(
    bonus * simulation_lf.loc[agent_id, f"Top-{rank} %"]
    for bonus, rank in zip(bonus_points, threshold_ranks)
)
print(f"Baseline LoseFirst average reward = {baseline_reward_lf:.1f}")

# Last expression: display the full per-team simulation table.
simulation_lf

Simulating tournaments: 100%|██████████| 2000/2000 [00:08<00:00, 242.81it/s]


Baseline LoseFirst average reward = 17.6


Unnamed: 0,Team,Strength,Avg_Points,Avg_Rank,Top-4 %,Top-12 %
0,0,1.0,12.038732,4.463783,0.638833,0.937123
1,1,0.5,9.975855,7.034708,0.375252,0.81992
2,2,0.25,8.903924,8.618712,0.267103,0.737928
3,3,0.125,8.340543,9.481891,0.213783,0.683602
4,4,0.0625,8.254024,9.660966,0.209256,0.651408
5,5,0.03125,8.100101,9.858652,0.183099,0.65493
6,6,0.015625,8.15493,9.774145,0.203722,0.650402
7,7,0.007812,8.176056,9.755533,0.203722,0.649396
8,8,0.003906,8.047787,10.003521,0.19165,0.638833
9,9,0.001953,8.003521,10.087022,0.18159,0.627767


In [None]:
# Simulate a single tournament, asking the environment for verbose output.
final_standings = env.simulate_tournament(verbose=True)

# Each standings entry is unpacked as (team_id, points, opp_avg, strength).
# The header lists the fields in the order they are printed on every line,
# including the strength that the previous header omitted.
print("\nFinal standings (team_id, strength, points, opponent_average):")
for rank, (team_id, points, opp_avg, strength) in enumerate(final_standings, start=1):
    print(f"Rank {rank}: Team {team_id} - Strength {strength:.2f} - Points: {points} - Opponent Avg: {opp_avg:.2f}")
