In [39]:
import pandas as pd
import numpy as np

from rl_lib.swiss_round.environment import SwissRoundEnv
from rl_lib.swiss_round.agent import DQNAgent

%reload_ext autoreload
%autoreload 2

In [40]:
# --- Tournament configuration ---
n_teams = 18
n_rounds = 6

# Geometric strength ladder: team 0 has strength 1.0 and every following team
# is 0.9 times the previous one.
team_strengths = [0.9 ** team_idx for team_idx in range(n_teams)]

# Rank thresholds and the bonus attached to each (the baseline cells pair the
# two lists position-wise with zip).
threshold_ranks = [4, 12]
bonus_points = [20, 20]

# Agent_id just below last threshold
agent_id = threshold_ranks[-1]

# --- Environment ---
env_kwargs = dict(
    n_teams=n_teams,
    n_rounds=n_rounds,
    team_strengths=team_strengths,
    threshold_ranks=threshold_ranks,
    bonus_points=bonus_points,
    agent_id=agent_id,
    max_draw_probability=0.3,
)
env = SwissRoundEnv(**env_kwargs)

### RL Agent

In [41]:
# Build a DQN agent bound to the tournament environment, then train it.
# Hyper-parameters are grouped in one mapping so they are easy to tweak.
dqn_settings = {
    "hidden_size": 256,
    "buffer_size": 100_000,
    "epsilon_decay": 0.999,
}
agent = DQNAgent(env, **dqn_settings)

# Training prints a progress line every 100 episodes (see the log below).
agent.train(n_episodes=10_000)

Episode 100/10000, Avg Reward: 3.80, Epsilon: 0.576, (failed episodes: 4)
Episode 200/10000, Avg Reward: 4.40, Epsilon: 0.313, (failed episodes: 6)
Episode 300/10000, Avg Reward: 7.40, Epsilon: 0.171, (failed episodes: 7)
Episode 400/10000, Avg Reward: 9.00, Epsilon: 0.094, (failed episodes: 7)
Episode 500/10000, Avg Reward: 9.00, Epsilon: 0.051, (failed episodes: 7)
Episode 600/10000, Avg Reward: 11.00, Epsilon: 0.028, (failed episodes: 7)
Episode 700/10000, Avg Reward: 8.80, Epsilon: 0.015, (failed episodes: 7)
Episode 800/10000, Avg Reward: 10.80, Epsilon: 0.010, (failed episodes: 8)
Episode 900/10000, Avg Reward: 8.80, Epsilon: 0.010, (failed episodes: 10)
Episode 1000/10000, Avg Reward: 10.20, Epsilon: 0.010, (failed episodes: 11)
Episode 1100/10000, Avg Reward: 9.60, Epsilon: 0.010, (failed episodes: 11)
Episode 1200/10000, Avg Reward: 11.40, Epsilon: 0.010, (failed episodes: 11)
Episode 1300/10000, Avg Reward: 9.40, Epsilon: 0.010, (failed episodes: 11)
Episode 1400/10000, Avg R

### Baseline Simulations

In [None]:
# WinAll baseline: simulate tournaments under the 'win_all' policy and read
# the agent's row from the resulting summary table.
simulation_wa = env.simulate_n_tournaments(
    2000,
    24,
    policy="win_all",
    display_results=False,
)

# Baseline reward = average points + sum over thresholds of
# (bonus * value of the matching "Top-<rank> %" column for the agent).
avg_points_wa = simulation_wa.loc[agent_id, "Avg_Points"]
expected_bonus_wa = sum(
    bonus * simulation_wa.loc[agent_id, f"Top-{rank} %"]
    for bonus, rank in zip(bonus_points, threshold_ranks)
)
baseline_reward_wa = avg_points_wa + expected_bonus_wa

print(f"Basline WinAll average reward = {baseline_reward_wa:.1f}")
simulation_wa

In [None]:
# LoseFirst baseline: same simulation call as the WinAll cell (positional
# arguments 2000 and 24), but run under the lose-first policy so the two
# baselines can actually be compared.
#
# Fix: this cell previously passed policy='win_all', so the "LoseFirst"
# numbers were just a second run of the WinAll baseline.
# NOTE(review): 'lose_first' is assumed to be the policy key accepted by
# SwissRoundEnv.simulate_n_tournaments (mirrors the 'win_all' naming) —
# confirm against the environment implementation.
simulation_lf = env.simulate_n_tournaments(2000,24, policy = 'lose_first',display_results=False)

# Baseline reward = average points + sum over thresholds of
# (bonus * value of the matching "Top-<rank> %" column for the agent).
# NOTE(review): this treats the "Top-<rank> %" columns as probabilities in
# [0, 1]; if they are stored as 0-100 percentages the bonus term is
# over-weighted — verify.
baseline_reward_lf = simulation_lf.loc[agent_id,'Avg_Points'] + sum([b * simulation_lf.loc[agent_id,f"Top-{t} %"] for b,t in zip(
    bonus_points, threshold_ranks
)])
print(f"Basline LoseFirst average reward = {baseline_reward_lf:.1f}")
simulation_lf

In [None]:
# Run one tournament with verbose output and keep the final standings.
final_standings = env.simulate_tournament(verbose=True)

print("\nFinal standings (team_id, points, opponent_average):")

# Each standings entry unpacks to (team_id, points, opponent average, strength);
# ranks are numbered from 1 in the order the environment returned them.
for rank, standing in enumerate(final_standings, start=1):
    team_id, points, opp_avg, strength = standing
    line = (
        f"Rank {rank}: Team {team_id} - Strength {strength:.2f} - "
        f"Points: {points} - Opponent Avg: {opp_avg:.2f}"
    )
    print(line)
