In [9]:
import pandas as pd
import numpy as np

from rl_lib.swiss_round.environment import SwissRoundEnv
from rl_lib.swiss_round.agent import DQNAgent

%reload_ext autoreload
%autoreload 2

# Utils 

In [15]:
def probability_tables(team_strengths, max_draw_probability):
    """Build the pairwise win / draw / loss probability tables for all teams.

    For every ordered pair (i, j), with ``d = team_strengths[i] - team_strengths[j]``,
    three raw scores are computed:

    * win  : ``1 / (1 + exp(-d))``  (logistic function of the strength gap)
    * loss : ``1 / (1 + exp(+d))``
    * draw : ``max_draw_probability * exp(-|d|)``

    The raw scores are then divided by their sum so that the three outcomes
    add up to 1 for every pairing (plain normalisation, not a softmax).

    Parameters
    ----------
    team_strengths : sequence of float
        One strength value per team; position in the sequence is the team id.
    max_draw_probability : float
        Raw (pre-normalisation) draw score of two equally strong teams.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame, pandas.DataFrame)
        Win, draw and loss tables, in that order. Rows are the team, columns
        are the opponent, both labelled ``0 .. len(team_strengths) - 1``.
    """
    strengths = np.asarray(team_strengths, dtype=float)
    index = range(len(strengths))

    # Broadcasting yields the full matrix of gaps: strength_diff[i, j] = s_i - s_j.
    strength_diff = strengths[:, None] - strengths[None, :]

    raw_win = 1 / (1 + np.exp(-strength_diff))
    raw_loss = 1 / (1 + np.exp(strength_diff))
    raw_draw = max_draw_probability * np.exp(-np.abs(strength_diff))

    # Shared denominator, computed once instead of once per outcome.
    total = raw_win + raw_draw + raw_loss

    return (
        pd.DataFrame(raw_win / total, index=index, columns=index),
        pd.DataFrame(raw_draw / total, index=index, columns=index),
        pd.DataFrame(raw_loss / total, index=index, columns=index),
    )

In [23]:
def check_probability(team_strengths, max_draw_probability):
    """Tell whether win + draw + loss is numerically 1 for every team pairing.

    The three tables are built by ``probability_tables`` from the given
    arguments, summed element-wise and compared against an all-ones array.

    Returns
    -------
    bool
        True when every summed entry is within tolerance of 1.
    """
    win_table, draw_table, loss_table = probability_tables(
        team_strengths=team_strengths,
        max_draw_probability=max_draw_probability,
    )
    totals = (win_table + draw_table + loss_table).to_numpy()
    expected = np.ones_like(totals)

    # Tolerances: rtol = 1e-4 and atol = 1e-7 (originally spelled 10e-5 / 10e-8).
    return np.allclose(totals, expected, rtol=1e-4, atol=1e-7)

# Environment

In [62]:
# Tournament configuration shared by every cell below.
n_teams = 18
n_rounds = 6
# Passed to both probability_tables and SwissRoundEnv as max_draw_probability.
mdp = 0.3
# Strengths decrease linearly from 2 in steps of 0.3, so team 0 is the strongest.
team_strengths = [2 - 0.3*i for i in range(n_teams)]
# threshold_ranks and bonus_points are paired element-wise (zipped) when the
# baseline rewards are computed further down.
# NOTE(review): presumably bonus_points[k] is granted for finishing within
# threshold_ranks[k] - confirm in SwissRoundEnv.
threshold_ranks = [4,12]
bonus_points = [20,20]
# Team ids are 0-based, so the id equal to the last threshold rank belongs to
# the team ranked one place below that threshold by strength.
agent_id = threshold_ranks[-1] # agent id just below the last threshold
# Number of tournaments simulated for each baseline policy.
n_baselines_simu = 2000

# Sanity check: win + draw + loss must sum to 1 for every pairing.
print(f"Valid probability set-up : {check_probability(team_strengths, mdp)}")

# Environment used by the baseline simulations and by the RL agent.
env = SwissRoundEnv(
    n_teams=n_teams,
    n_rounds=n_rounds,
    team_strengths=team_strengths,
    threshold_ranks=threshold_ranks,
    bonus_points=bonus_points,
    agent_id=agent_id,
    max_draw_probability=mdp
)

Valid probability set-up : True


### Probability table

In [63]:
# Display the configured team strengths, rounded to 2 decimals for readability.
np.array(team_strengths).round(2)

array([ 2. ,  1.7,  1.4,  1.1,  0.8,  0.5,  0.2, -0.1, -0.4, -0.7, -1. ,
       -1.3, -1.6, -1.9, -2.2, -2.5, -2.8, -3.1])

In [64]:
# Outcome probabilities of the agent's team against every opponent:
# one row per opponent, one column per outcome (Win / Draw / Loss).
wp, dp, lp = probability_tables(team_strengths=team_strengths, max_draw_probability=mdp)
agent_outcomes = [
    table.loc[[agent_id]].rename(index={agent_id: label})
    for label, table in (('Win', wp), ('Draw', dp), ('Loss', lp))
]
pd.concat(agent_outcomes).T.round(2)

Unnamed: 0,Win,Draw,Loss
0,0.03,0.01,0.97
1,0.04,0.01,0.95
2,0.05,0.01,0.94
3,0.06,0.02,0.92
4,0.08,0.03,0.89
5,0.11,0.04,0.86
6,0.14,0.05,0.82
7,0.17,0.06,0.77
8,0.21,0.08,0.7
9,0.26,0.11,0.63


#### Detailed tables

In [65]:
# Win probabilities (row = team, column = opponent), rounded for display.
wp.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.38,0.47,0.55,0.63,0.7,0.77,0.82,0.86,0.89,0.92,0.94,0.95,0.97,0.97,0.98,0.99,0.99,0.99
1,0.35,0.38,0.47,0.55,0.63,0.7,0.77,0.82,0.86,0.89,0.92,0.94,0.95,0.97,0.97,0.98,0.99,0.99
2,0.3,0.35,0.38,0.47,0.55,0.63,0.7,0.77,0.82,0.86,0.89,0.92,0.94,0.95,0.97,0.97,0.98,0.99
3,0.26,0.3,0.35,0.38,0.47,0.55,0.63,0.7,0.77,0.82,0.86,0.89,0.92,0.94,0.95,0.97,0.97,0.98
4,0.21,0.26,0.3,0.35,0.38,0.47,0.55,0.63,0.7,0.77,0.82,0.86,0.89,0.92,0.94,0.95,0.97,0.97
5,0.17,0.21,0.26,0.3,0.35,0.38,0.47,0.55,0.63,0.7,0.77,0.82,0.86,0.89,0.92,0.94,0.95,0.97
6,0.14,0.17,0.21,0.26,0.3,0.35,0.38,0.47,0.55,0.63,0.7,0.77,0.82,0.86,0.89,0.92,0.94,0.95
7,0.11,0.14,0.17,0.21,0.26,0.3,0.35,0.38,0.47,0.55,0.63,0.7,0.77,0.82,0.86,0.89,0.92,0.94
8,0.08,0.11,0.14,0.17,0.21,0.26,0.3,0.35,0.38,0.47,0.55,0.63,0.7,0.77,0.82,0.86,0.89,0.92
9,0.06,0.08,0.11,0.14,0.17,0.21,0.26,0.3,0.35,0.38,0.47,0.55,0.63,0.7,0.77,0.82,0.86,0.89


In [42]:
# Draw probabilities (row = team, column = opponent), rounded for display.
dp.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.23,0.12,0.07,0.04,0.03,0.02,0.02,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
1,0.12,0.23,0.14,0.09,0.06,0.04,0.03,0.03,0.02,0.02,0.02,0.02,0.02,0.02,0.01,0.01,0.01,0.01
2,0.07,0.14,0.23,0.15,0.11,0.08,0.06,0.05,0.04,0.04,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.02
3,0.04,0.09,0.15,0.23,0.17,0.13,0.1,0.08,0.07,0.06,0.06,0.05,0.05,0.05,0.04,0.04,0.04,0.04
4,0.03,0.06,0.11,0.17,0.23,0.18,0.14,0.12,0.1,0.09,0.08,0.08,0.07,0.07,0.06,0.06,0.06,0.06
5,0.02,0.04,0.08,0.13,0.18,0.23,0.19,0.16,0.14,0.12,0.11,0.1,0.1,0.09,0.09,0.09,0.08,0.08
6,0.02,0.03,0.06,0.1,0.14,0.19,0.23,0.2,0.17,0.15,0.14,0.13,0.12,0.12,0.11,0.11,0.11,0.1
7,0.01,0.03,0.05,0.08,0.12,0.16,0.2,0.23,0.2,0.18,0.17,0.15,0.15,0.14,0.13,0.13,0.13,0.12
8,0.01,0.02,0.04,0.07,0.1,0.14,0.17,0.2,0.23,0.21,0.19,0.18,0.17,0.16,0.15,0.15,0.15,0.14
9,0.01,0.02,0.04,0.06,0.09,0.12,0.15,0.18,0.21,0.23,0.21,0.2,0.19,0.18,0.17,0.17,0.16,0.16


In [43]:
# Loss probabilities (row = team, column = opponent), rounded for display.
lp.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.38,0.27,0.18,0.12,0.08,0.06,0.05,0.04,0.03,0.03,0.03,0.02,0.02,0.02,0.02,0.02,0.02,0.02
1,0.61,0.38,0.3,0.22,0.16,0.13,0.1,0.08,0.07,0.06,0.06,0.05,0.05,0.05,0.05,0.04,0.04,0.04
2,0.75,0.57,0.38,0.32,0.25,0.21,0.17,0.14,0.13,0.11,0.1,0.1,0.09,0.09,0.08,0.08,0.08,0.08
3,0.84,0.69,0.53,0.38,0.33,0.28,0.24,0.21,0.19,0.17,0.16,0.15,0.14,0.13,0.13,0.12,0.12,0.12
4,0.89,0.78,0.64,0.5,0.38,0.34,0.31,0.27,0.25,0.23,0.21,0.2,0.19,0.18,0.18,0.17,0.17,0.16
5,0.92,0.83,0.72,0.59,0.48,0.38,0.35,0.32,0.3,0.28,0.26,0.25,0.24,0.23,0.22,0.22,0.21,0.21
6,0.94,0.87,0.77,0.66,0.55,0.46,0.38,0.36,0.34,0.32,0.3,0.29,0.28,0.27,0.26,0.26,0.25,0.25
7,0.95,0.89,0.81,0.71,0.61,0.52,0.44,0.38,0.37,0.35,0.33,0.32,0.31,0.3,0.29,0.29,0.28,0.28
8,0.96,0.9,0.83,0.74,0.65,0.57,0.49,0.43,0.38,0.37,0.36,0.34,0.33,0.33,0.32,0.31,0.31,0.31
9,0.96,0.92,0.85,0.77,0.68,0.6,0.53,0.47,0.42,0.38,0.37,0.36,0.35,0.35,0.34,0.33,0.33,0.33


In [22]:
# Visual sanity check: the three tables should sum to 1 in every cell.
wp+dp+lp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Simulations

### Baselines Simulations

In [66]:
# Baseline 1: simulate tournaments with the 'win_all' policy.
simulation_wa = env.simulate_n_tournaments(
    n_baselines_simu,
    n_cores=32,
    policy='win_all',
    display_results=False,
)
# Baseline reward = the agent's average points plus each bonus weighted by the
# matching "Top-<rank> %" value (presumably the share of tournaments finished
# inside that rank threshold - confirm in SwissRoundEnv).
baseline_reward_wa = simulation_wa.loc[agent_id, 'Avg_Points'] + sum(
    bonus * simulation_wa.loc[agent_id, f"Top-{rank_cutoff} %"]
    for bonus, rank_cutoff in zip(bonus_points, threshold_ranks)
)
print(f"Baseline WinAll average reward = {baseline_reward_wa:.1f}")
simulation_wa.loc[agent_id]

Simulating tournaments: 100%|██████████| 2000/2000 [00:11<00:00, 169.31it/s]

Baseline WinAll average reward = 16.5





Team          12.000000
Strength      -1.600000
Avg_Points     6.838481
Avg_Rank      12.509367
Top-4 %        0.010633
Top-12 %       0.472911
Name: 12, dtype: float64

In [67]:
# Baseline 2: simulate tournaments with the 'lose_first' policy.
simulation_lf = env.simulate_n_tournaments(
    n_baselines_simu,
    n_cores=32,
    policy='lose_first',
    display_results=False,
)
# Baseline reward = the agent's average points plus each bonus weighted by the
# matching "Top-<rank> %" value (presumably the share of tournaments finished
# inside that rank threshold - confirm in SwissRoundEnv).
baseline_reward_lf = simulation_lf.loc[agent_id, 'Avg_Points'] + sum(
    bonus * simulation_lf.loc[agent_id, f"Top-{rank_cutoff} %"]
    for bonus, rank_cutoff in zip(bonus_points, threshold_ranks)
)
print(f"Baseline LoseFirst average reward = {baseline_reward_lf:.1f}")
simulation_lf.loc[agent_id]

Simulating tournaments: 100%|██████████| 2000/2000 [00:12<00:00, 158.33it/s]

Baseline LoseFirst average reward = 14.7





Team          12.000000
Strength      -1.600000
Avg_Points     6.567870
Avg_Rank      13.066599
Top-4 %        0.006101
Top-12 %       0.399593
Name: 12, dtype: float64

### RL Agent

In [68]:
# Recap both baseline rewards so they can be compared with the agent's training log.
print(f"Baselines average reward : WinAll = {baseline_reward_wa:.1f}, LoseFirst = {baseline_reward_lf:.1f}")

Baselines average reward : WinAll = 16.5, LoseFirst = 14.7


In [69]:
# Build a DQN agent on the environment configured above and train it for 6000 episodes.
# NOTE(review): the exact meaning of hidden_dims / dropout / buffer_size /
# epsilon_decay is defined by rl_lib's DQNAgent, which is not visible in this
# notebook - confirm there before tuning.
agent = DQNAgent(env,
                 hidden_dims=[256,128,64],
                 dropout= 0.1,
                 buffer_size=10000,
                 epsilon_decay=0.9995)
agent.train(n_episodes=6000)

Episode 100/6000 | Avg Reward: 5.44 | Avg nb gambits played 3.78 | Epsilon: 0.761 | Failed episodes: 2
Episode 200/6000 | Avg Reward: 5.68 | Avg nb gambits played 3.54 | Epsilon: 0.563 | Failed episodes: 3
Episode 300/6000 | Avg Reward: 7.98 | Avg nb gambits played 3.18 | Epsilon: 0.417 | Failed episodes: 3
Episode 400/6000 | Avg Reward: 10.41 | Avg nb gambits played 2.69 | Epsilon: 0.309 | Failed episodes: 3
Episode 500/6000 | Avg Reward: 10.75 | Avg nb gambits played 2.31 | Epsilon: 0.229 | Failed episodes: 3
Episode 600/6000 | Avg Reward: 11.69 | Avg nb gambits played 1.67 | Epsilon: 0.169 | Failed episodes: 4
Episode 700/6000 | Avg Reward: 12.54 | Avg nb gambits played 1.54 | Epsilon: 0.125 | Failed episodes: 4
Episode 800/6000 | Avg Reward: 14.00 | Avg nb gambits played 1.33 | Epsilon: 0.093 | Failed episodes: 5
Episode 900/6000 | Avg Reward: 14.88 | Avg nb gambits played 1.02 | Epsilon: 0.068 | Failed episodes: 6
Episode 1000/6000 | Avg Reward: 14.29 | Avg nb gambits played 0.80 

### Verbose simulation

In [None]:
# Play a single tournament with verbose output, then list the final ranking.
final_standings = env.simulate_tournament(verbose=True)

print("\nFinal standings (team_id, points, opponent_average):")
for rank, standing in enumerate(final_standings, start=1):
    # Each standing unpacks as (team id, points, opponent average, strength).
    team_id, points, opp_avg, strength = standing
    print(f"Rank {rank}: Team {team_id} - Strength {strength:.2f} - Points: {points} - Opponent Avg: {opp_avg:.2f}")
