In [2]:
import pandas as pd
import numpy as np

from rl_lib.swiss_round.environment import SwissRoundEnv
from rl_lib.swiss_round.agent import DQNAgent

%reload_ext autoreload
%autoreload 2

# Utils 

In [3]:
def probability_tables(team_strengths, max_draw_probability):
    """Build pairwise win / draw / loss probability tables for all team pairings.

    For a row team ``i`` and a column team ``j`` with strength difference
    ``d = s_i - s_j`` three unnormalised weights are computed:

    * win:  ``1 / (1 + exp(-d))``
    * loss: ``1 / (1 + exp(+d))``
    * draw: ``max_draw_probability * exp(-|d|)``

    The weights are then divided by their sum so that the three outcomes add
    up to 1 for every pairing. This is a plain normalisation (not a softmax).
    As a consequence the draw probability of two equally strong teams is
    ``m / (1 + m)`` with ``m = max_draw_probability``, not ``m`` itself.

    Parameters
    ----------
    team_strengths : 1-D sequence of numbers
        One strength per team; the team at position ``i`` gets label ``i``.
    max_draw_probability : float
        Unnormalised weight of a draw between two teams of equal strength.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(win, draw, loss)`` tables, each of shape ``(n_teams, n_teams)`` and
        labelled ``0 .. n_teams - 1`` on both axes. Entry ``[i, j]`` is the
        probability of the corresponding outcome for team ``i`` against team ``j``.
    """
    strengths = np.asarray(team_strengths, dtype=float)
    labels = range(len(strengths))

    # Row team minus column team, for every pairing at once (broadcasting).
    strength_diff = strengths[:, np.newaxis] - strengths[np.newaxis, :]

    raw_win = 1 / (1 + np.exp(-strength_diff))
    raw_loss = 1 / (1 + np.exp(strength_diff))
    raw_draw = max_draw_probability * np.exp(-np.abs(strength_diff))

    # Shared denominator that rescales the three weights into probabilities.
    total = raw_win + raw_draw + raw_loss

    def as_table(values):
        return pd.DataFrame(values, index=labels, columns=labels)

    return as_table(raw_win / total), as_table(raw_draw / total), as_table(raw_loss / total)

In [4]:
def check_probability(team_strengths, max_draw_probability):
    """Tell whether the win / draw / loss tables form valid probabilities.

    The tables are obtained from ``probability_tables`` and are considered
    valid when, for every pairing:

    * each individual probability lies in ``[0, 1]`` (a sum equal to 1 alone
      does not rule out negative entries, e.g. with a negative
      ``max_draw_probability``), and
    * win + draw + loss is equal to 1 within a small tolerance.

    Parameters
    ----------
    team_strengths : sequence of numbers
        Forwarded to ``probability_tables``.
    max_draw_probability : float
        Forwarded to ``probability_tables``.

    Returns
    -------
    bool
        ``True`` when both conditions hold for all pairings, ``False`` otherwise
        (NaN entries fail both conditions).
    """
    wp, dp, lp = probability_tables(team_strengths=team_strengths, max_draw_probability=max_draw_probability)

    tables = [table.to_numpy() for table in (wp, dp, lp)]
    in_range = all(((values >= 0) & (values <= 1)).all() for values in tables)

    totals = (wp + dp + lp).to_numpy()
    # rtol=1e-4 / atol=1e-7: looser than numpy's defaults (1e-5 / 1e-8).
    sums_to_one = np.allclose(totals, 1.0, rtol=1e-4, atol=1e-7)

    return bool(in_range and sums_to_one)

# Environment

In [5]:
# Tournament set-up shared by the cells below.
n_teams = 18
threshold_ranks = [4, 12]
# The agent is the team whose (0-based) id equals the last threshold rank;
# with strengths sorted in decreasing order it sits just below that threshold.
agent_id = threshold_ranks[-1]
# Strengths decrease linearly from 4 (team 0) down to 0 (last team).
team_strengths = np.linspace(4, 0, n_teams)
# Alternative (geometric decay): team_strengths = [4 * 0.9 ** i for i in range(n_teams)]
mdp = 0.5

print(np.array(team_strengths).round(2))
wp, dp, lp = probability_tables(team_strengths=team_strengths, max_draw_probability=mdp)

# One row per outcome for the agent, then transposed so opponents become rows.
agent_outcome_rows = [
    table.loc[[agent_id]].rename(index={agent_id: outcome})
    for outcome, table in (('Win', wp), ('Draw', dp), ('Loss', lp))
]
pd.concat(agent_outcome_rows).T.round(2)

[4.   3.76 3.53 3.29 3.06 2.82 2.59 2.35 2.12 1.88 1.65 1.41 1.18 0.94
 0.71 0.47 0.24 0.  ]


Unnamed: 0,Win,Draw,Loss
0,0.05,0.03,0.92
1,0.07,0.04,0.9
2,0.08,0.05,0.87
3,0.1,0.06,0.84
4,0.12,0.07,0.81
5,0.15,0.09,0.76
6,0.17,0.11,0.72
7,0.2,0.13,0.66
8,0.23,0.16,0.6
9,0.27,0.2,0.54


In [6]:
n_rounds = 6
bonus_points = [30, 20]
# Number of tournaments simulated for each baseline policy.
# (agent_id, set in the configuration cell, is the team just below the last threshold.)
n_baselines_simu = 2000

print(f"Valid probability set-up : {check_probability(team_strengths, mdp)}")

# Gather the environment arguments in one place before building it.
env_settings = {
    'n_teams': n_teams,
    'n_rounds': n_rounds,
    'name': 'lin_4',
    'team_strengths': team_strengths,
    'threshold_ranks': threshold_ranks,
    'bonus_points': bonus_points,
    'agent_id': agent_id,
    'max_draw_probability': mdp,
}
env = SwissRoundEnv(**env_settings)

Valid probability set-up : True


#### Detailed tables

In [7]:
# Win probabilities of the row team against the column team, to two decimals.
wp.round(decimals=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.33,0.4,0.47,0.54,0.6,0.66,0.72,0.76,0.81,0.84,0.87,0.9,0.92,0.93,0.95,0.96,0.97,0.97
1,0.32,0.33,0.4,0.47,0.54,0.6,0.66,0.72,0.76,0.81,0.84,0.87,0.9,0.92,0.93,0.95,0.96,0.97
2,0.29,0.32,0.33,0.4,0.47,0.54,0.6,0.66,0.72,0.76,0.81,0.84,0.87,0.9,0.92,0.93,0.95,0.96
3,0.27,0.29,0.32,0.33,0.4,0.47,0.54,0.6,0.66,0.72,0.76,0.81,0.84,0.87,0.9,0.92,0.93,0.95
4,0.23,0.27,0.29,0.32,0.33,0.4,0.47,0.54,0.6,0.66,0.72,0.76,0.81,0.84,0.87,0.9,0.92,0.93
5,0.2,0.23,0.27,0.29,0.32,0.33,0.4,0.47,0.54,0.6,0.66,0.72,0.76,0.81,0.84,0.87,0.9,0.92
6,0.17,0.2,0.23,0.27,0.29,0.32,0.33,0.4,0.47,0.54,0.6,0.66,0.72,0.76,0.81,0.84,0.87,0.9
7,0.15,0.17,0.2,0.23,0.27,0.29,0.32,0.33,0.4,0.47,0.54,0.6,0.66,0.72,0.76,0.81,0.84,0.87
8,0.12,0.15,0.17,0.2,0.23,0.27,0.29,0.32,0.33,0.4,0.47,0.54,0.6,0.66,0.72,0.76,0.81,0.84
9,0.1,0.12,0.15,0.17,0.2,0.23,0.27,0.29,0.32,0.33,0.4,0.47,0.54,0.6,0.66,0.72,0.76,0.81


In [8]:
# Draw probabilities for every pairing, to two decimals.
dp.round(decimals=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.33,0.28,0.24,0.2,0.16,0.13,0.11,0.09,0.07,0.06,0.05,0.04,0.03,0.02,0.02,0.01,0.01,0.01
1,0.28,0.33,0.28,0.24,0.2,0.16,0.13,0.11,0.09,0.07,0.06,0.05,0.04,0.03,0.02,0.02,0.01,0.01
2,0.24,0.28,0.33,0.28,0.24,0.2,0.16,0.13,0.11,0.09,0.07,0.06,0.05,0.04,0.03,0.02,0.02,0.01
3,0.2,0.24,0.28,0.33,0.28,0.24,0.2,0.16,0.13,0.11,0.09,0.07,0.06,0.05,0.04,0.03,0.02,0.02
4,0.16,0.2,0.24,0.28,0.33,0.28,0.24,0.2,0.16,0.13,0.11,0.09,0.07,0.06,0.05,0.04,0.03,0.02
5,0.13,0.16,0.2,0.24,0.28,0.33,0.28,0.24,0.2,0.16,0.13,0.11,0.09,0.07,0.06,0.05,0.04,0.03
6,0.11,0.13,0.16,0.2,0.24,0.28,0.33,0.28,0.24,0.2,0.16,0.13,0.11,0.09,0.07,0.06,0.05,0.04
7,0.09,0.11,0.13,0.16,0.2,0.24,0.28,0.33,0.28,0.24,0.2,0.16,0.13,0.11,0.09,0.07,0.06,0.05
8,0.07,0.09,0.11,0.13,0.16,0.2,0.24,0.28,0.33,0.28,0.24,0.2,0.16,0.13,0.11,0.09,0.07,0.06
9,0.06,0.07,0.09,0.11,0.13,0.16,0.2,0.24,0.28,0.33,0.28,0.24,0.2,0.16,0.13,0.11,0.09,0.07


In [9]:
# Loss probabilities of the row team against the column team, to two decimals.
lp.round(decimals=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.33,0.32,0.29,0.27,0.23,0.2,0.17,0.15,0.12,0.1,0.08,0.07,0.05,0.04,0.04,0.03,0.02,0.02
1,0.4,0.33,0.32,0.29,0.27,0.23,0.2,0.17,0.15,0.12,0.1,0.08,0.07,0.05,0.04,0.04,0.03,0.02
2,0.47,0.4,0.33,0.32,0.29,0.27,0.23,0.2,0.17,0.15,0.12,0.1,0.08,0.07,0.05,0.04,0.04,0.03
3,0.54,0.47,0.4,0.33,0.32,0.29,0.27,0.23,0.2,0.17,0.15,0.12,0.1,0.08,0.07,0.05,0.04,0.04
4,0.6,0.54,0.47,0.4,0.33,0.32,0.29,0.27,0.23,0.2,0.17,0.15,0.12,0.1,0.08,0.07,0.05,0.04
5,0.66,0.6,0.54,0.47,0.4,0.33,0.32,0.29,0.27,0.23,0.2,0.17,0.15,0.12,0.1,0.08,0.07,0.05
6,0.72,0.66,0.6,0.54,0.47,0.4,0.33,0.32,0.29,0.27,0.23,0.2,0.17,0.15,0.12,0.1,0.08,0.07
7,0.76,0.72,0.66,0.6,0.54,0.47,0.4,0.33,0.32,0.29,0.27,0.23,0.2,0.17,0.15,0.12,0.1,0.08
8,0.81,0.76,0.72,0.66,0.6,0.54,0.47,0.4,0.33,0.32,0.29,0.27,0.23,0.2,0.17,0.15,0.12,0.1
9,0.84,0.81,0.76,0.72,0.66,0.6,0.54,0.47,0.4,0.33,0.32,0.29,0.27,0.23,0.2,0.17,0.15,0.12


In [10]:
# Element-wise total of the three outcome tables; every entry should be 1.
wp.add(dp).add(lp)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Simulations

### CALMIP agent log

In [11]:
# Read the CALMIP experiment log (first CSV column is the row index) and show
# only the hyper-parameters and test metrics.
cpdf = pd.read_csv('../logs/calmip_logs.csv', index_col=0)
calmip_summary_columns = [
    'exp_id', 'env_name', 'n_episodes', 'lr', 'layers', 'dropout',
    'avg_test_rewards', 'std_test_rewards',
    'avg_test_gambits', 'std_test_gambits',
]
cpdf[calmip_summary_columns]

Unnamed: 0,exp_id,env_name,n_episodes,lr,layers,dropout,avg_test_rewards,std_test_rewards,avg_test_gambits,std_test_gambits
0,cp_old_1,old_lin_9,50000,0.001,256_128_64,0.1,15.646,11.180862,0.619,0.776427
1,cp_old_2,old_lin_9,50000,0.001,512_128_32,0.1,15.9255,11.35002,0.238,0.500356
2,cp_old_3,old_lin_9,36000,0.001,256_128_64,0.1,15.049,11.148255,0.619,0.71403
3,cp_old_4,old_lin_9,36000,0.001,512_128_32,0.1,15.683,11.186845,0.5875,0.748561
4,cp_old_5,old_lin_9,36000,0.001,512_256_128_64_32,0.1,15.081,11.020274,0.6485,0.757593
5,cp_old_6,old_lin_9,36000,0.001,256_128_64,0.2,16.032,11.266542,0.755,0.792449
6,cp_old_7,old_lin_9,36000,0.001,512_128_32,0.2,15.8965,11.287373,0.499,0.674536
7,cp_old_8,old_lin_9,36000,0.001,512_256_128_64_32,0.2,15.494,11.190441,0.7515,0.772494


In [12]:
# Read the experiment log stored in exp_logs.csv (first CSV column is the row
# index) and show only the hyper-parameters and test metrics.
ldf = pd.read_csv('../logs/exp_logs.csv', index_col=0)
local_summary_columns = [
    'exp_id', 'env_name', 'n_episodes', 'lr', 'layers', 'dropout',
    'avg_test_rewards', 'std_test_rewards',
    'avg_test_gambits', 'std_test_gambits',
]
ldf[local_summary_columns]

Unnamed: 0,exp_id,env_name,n_episodes,lr,layers,dropout,avg_test_rewards,std_test_rewards,avg_test_gambits,std_test_gambits
0,loc_1,lin_4,20000,0.001,256_128_64,0.1,15.5435,12.914028,0.539,0.777482
1,loc_1,lin_4,20000,0.0005,256_128_64,0.3,16.687,13.713462,0.0705,0.26745
2,loc_1,lin_4,32000,0.0003,64_64,0.2,16.7295,13.79791,0.0205,0.148593
3,loc_1,lin_4,32000,0.0003,64_64,0.3,16.5585,13.320795,0.074,0.283768
4,loc_1,lin_4,32000,0.0003,256_128_64,0.2,15.8145,12.670165,0.2875,0.587234


In [31]:
# List every column stored in the CALMIP log (the summary table shows only a subset).
cpdf.columns

Index(['exp_id', 'env_name', 'n_teams', 'n_rounds', 'thresholds', 'bonuses',
       'agent_id', 'strengths', 'n_episodes', 'n_test_episodes', 'lr',
       'n_layers', 'layers', 'dropout', 'batch_size', 'buffer_size', 'gamma',
       'epsilon', 'epsilon_end', 'epsilon_decay', 'avg_test_rewards',
       'std_test_rewards', 'avg_test_gambits', 'std_test_gambits'],
      dtype='object')

### Baselines Simulations

In [37]:
# Baseline with the 'win_all' policy; the results table is also printed by the call.
simulation_wa = env.simulate_n_tournaments(
    n_baselines_simu,
    policy='win_all',
    n_cores=32,
    display_results=True,
)

# Pull the agent's row once and read both statistics from it.
agent_row_wa = simulation_wa.loc[agent_id]
baseline_reward_wa = agent_row_wa['Avg_Reward']
baseline_std_wa = agent_row_wa['Std_Reward']
print(f"Baseline WinAll average reward = {baseline_reward_wa:.1f} ± {baseline_std_wa:.1f}")
agent_row_wa.round(2)

Simulating tournaments: 100%|██████████| 2000/2000 [00:10<00:00, 191.53it/s]



Simulation Results (from 1985 tournaments):
Team | Strength | Avg Points | Avg Rank | Avg Reward | Std Reward | Top-4 % | Top-12 % | 
----------------------------------------------------------------------------------------
   0 |     4.00 |      13.23 |     3.15 |      56.30 |      14.72 |  77.18% |  99.60% | 
   1 |     3.76 |      12.53 |     3.75 |      52.77 |      16.24 |  67.86% |  99.40% | 
   2 |     3.53 |      11.96 |     4.41 |      49.71 |      17.25 |  60.10% |  98.59% | 
   3 |     3.29 |      11.30 |     5.16 |      45.28 |      17.78 |  48.21% |  97.58% | 
   4 |     3.06 |      10.79 |     5.88 |      41.68 |      17.43 |  38.54% |  96.68% | 
   5 |     2.82 |      10.35 |     6.54 |      38.61 |      17.32 |  31.28% |  94.36% | 
   6 |     2.59 |       9.80 |     7.35 |      35.29 |      16.82 |  24.08% |  91.34% | 
   7 |     2.35 |       9.25 |     8.25 |      31.84 |      16.61 |  18.24% |  85.59% | 
   8 |     2.12 |       8.73 |     9.10 |      28.30 |      15.5

Team          12.00
Strength       1.18
Avg_Points     6.77
Avg_Rank      12.38
Avg_Reward    16.97
Std_Reward    13.73
Top-4 %        0.03
Top-12 %       0.47
Name: 12, dtype: float64

In [19]:
# Baseline with the 'lose_first' policy; the full results table is not printed.
simulation_lf = env.simulate_n_tournaments(
    n_baselines_simu,
    policy='lose_first',
    n_cores=32,
    display_results=False,
)

# Pull the agent's row once and read both statistics from it.
agent_row_lf = simulation_lf.loc[agent_id]
baseline_reward_lf = agent_row_lf['Avg_Reward']
baseline_std_lf = agent_row_lf['Std_Reward']
print(f"Baseline Lose First average reward = {baseline_reward_lf:.1f} ± {baseline_std_lf:.1f}")
agent_row_lf.round(2)

Simulating tournaments: 100%|██████████| 2000/2000 [00:10<00:00, 197.45it/s]

Baseline Lose First average reward = 14.2 ± 12.6





Team          12.00
Strength       1.18
Avg_Points     6.34
Avg_Rank      13.13
Avg_Reward    14.21
Std_Reward    12.59
Top-4 %        0.01
Top-12 %       0.37
Name: 12, dtype: float64

### RL Agent

In [38]:
# Recap of both baseline rewards, as a reference point for the RL agent below.
baselines_recap = (
    f"Baselines average reward : WinAll = {baseline_reward_wa:.1f} ± {baseline_std_wa:.1f}, "
    f"LoseFirst = {baseline_reward_lf:.1f} ± {baseline_std_lf:.1f}"
)
print(baselines_recap)

Baselines average reward : WinAll = 17.0 ± 13.7, LoseFirst = 14.2 ± 12.6


In [36]:
# Hyper-parameters passed to the DQN agent acting in `env`.
dqn_settings = {
    'hidden_dims': [256, 128, 64],
    'dropout': 0.3,
    'lr': 0.0005,
    'buffer_size': 10_000,
    'epsilon_decay': 0.9995,
}
agent = DQNAgent(env, **dqn_settings)

# Train, then evaluate; the evaluation result is the cell's displayed value.
agent.train(n_episodes=20_000)
agent.evaluate(n_episodes=2_000)

--- Training in progress ---
Episode 100/20000 | Avg Reward: 6.55 ± 8.54 |  Avg nb gambits played 3.76 ± 1.13 | Epsilon: 0.763 | Failed episodes: 1
Episode 200/20000 | Avg Reward: 6.54 ± 8.51 |  Avg nb gambits played 3.44 ± 1.28 | Epsilon: 0.561 | Failed episodes: 4
Episode 300/20000 | Avg Reward: 6.63 ± 7.62 |  Avg nb gambits played 2.93 ± 1.28 | Epsilon: 0.416 | Failed episodes: 4
Episode 400/20000 | Avg Reward: 12.20 ± 12.82 |  Avg nb gambits played 2.03 ± 1.11 | Epsilon: 0.307 | Failed episodes: 6
Episode 500/20000 | Avg Reward: 12.33 ± 13.60 |  Avg nb gambits played 1.65 ± 0.99 | Epsilon: 0.227 | Failed episodes: 6
Episode 600/20000 | Avg Reward: 13.36 ± 11.57 |  Avg nb gambits played 1.40 ± 1.05 | Epsilon: 0.168 | Failed episodes: 8
Episode 700/20000 | Avg Reward: 12.82 ± 11.00 |  Avg nb gambits played 1.26 ± 0.98 | Epsilon: 0.124 | Failed episodes: 9
Episode 800/20000 | Avg Reward: 15.21 ± 12.34 |  Avg nb gambits played 0.98 ± 1.01 | Epsilon: 0.092 | Failed episodes: 10
Episode 

### Verbose simulation

In [22]:
# Play a single tournament with verbose output enabled.
final_standings = env.simulate_tournament(verbose=True)

print("\nFinal standings (team_id, points, opponent_average):")
for rank, standing in enumerate(final_standings, start=1):
    # Each standing carries five fields, in this order.
    team_id, points, reward, opp_avg, strength = standing
    print(f"Rank {rank}: Team {team_id} - Strength {strength:.2f} - Points: {points} (reward {reward}) - Opponent Avg: {opp_avg:.2f}")


--- Simulating round n°1 ---
Game : Team 8 (points : 0, strength : 2.12) vs Team 2 (points : 0, strength : 3.53) : Draw
Game : Team 5 (points : 0, strength : 2.82) vs Team 1 (points : 0, strength : 3.76) : Team 1 wins
Game : Team 12 (points : 0, strength : 1.18) vs Team 11 (points : 0, strength : 1.41) : Team 11 wins
Game : Team 17 (points : 0, strength : 0.00) vs Team 9 (points : 0, strength : 1.88) : Team 9 wins
Game : Team 0 (points : 0, strength : 4.00) vs Team 10 (points : 0, strength : 1.65) : Team 10 wins
Game : Team 3 (points : 0, strength : 3.29) vs Team 6 (points : 0, strength : 2.59) : Draw
Game : Team 15 (points : 0, strength : 0.47) vs Team 14 (points : 0, strength : 0.71) : Team 14 wins
Game : Team 13 (points : 0, strength : 0.94) vs Team 7 (points : 0, strength : 2.35) : Team 7 wins
Game : Team 4 (points : 0, strength : 3.06) vs Team 16 (points : 0, strength : 0.24) : Team 4 wins
--- Simulating round n°2 ---
Game : Team 1 (points : 3, strength : 3.76) vs Team 4 (points :