In [1]:
import pandas as pd
import numpy as np

from rl_lib.swiss_round.environment import SwissRoundEnv
from rl_lib.swiss_round.agent import DQNAgent

%reload_ext autoreload
%autoreload 2

# Utils 

In [2]:
def probability_tables(team_strengths, max_draw_probability):
    """Build pairwise win / draw / loss probability tables for all team pairings.

    For the ordered pair (i, j) with strength difference ``d = s_i - s_j`` three
    unnormalised weights are computed:

    * win  : ``1 / (1 + exp(-d))``
    * loss : ``1 / (1 + exp(+d))``
    * draw : ``max_draw_probability * exp(-|d|)``

    Each weight is then divided by the sum of the three, so the outcomes of
    every pairing add up to 1.

    Parameters
    ----------
    team_strengths : sequence of numbers
        One strength per team; the position in the sequence is the team id.
    max_draw_probability : float
        Unnormalised draw weight of two equally strong teams (``d = 0``).
        After normalisation their draw probability is
        ``max_draw_probability / (1 + max_draw_probability)``.

    Returns
    -------
    tuple of three pandas.DataFrame
        ``(win, draw, loss)``; rows and columns are both labelled ``0..n-1``
        and entry ``[i, j]`` is the probability of that outcome for team ``i``
        when playing team ``j``.
    """
    strengths = np.asarray(team_strengths, dtype=float)
    index = range(len(strengths))

    # strength_diff[i, j] = strengths[i] - strengths[j], built by broadcasting
    # instead of a double Python loop.
    strength_diff = strengths[:, None] - strengths[None, :]

    win_weight = 1 / (1 + np.exp(-strength_diff))
    loss_weight = 1 / (1 + np.exp(strength_diff))
    draw_weight = max_draw_probability * np.exp(-np.abs(strength_diff))

    # Plain normalisation by the sum of the three weights (not a softmax);
    # computed once and shared by the three tables.
    total_weight = win_weight + draw_weight + loss_weight

    def as_table(weight):
        # Normalise one weight matrix and label it with team ids.
        return pd.DataFrame(weight / total_weight, index=index, columns=index)

    return as_table(win_weight), as_table(draw_weight), as_table(loss_weight)

In [3]:
def check_probability(team_strengths, max_draw_probability):
    """Tell whether win + draw + loss is numerically 1 for every team pairing.

    The three tables are obtained from ``probability_tables`` with the same
    arguments, summed element-wise and compared against an all-ones array.

    Returns
    -------
    bool
        Result of ``np.allclose`` on the summed tables versus ones.
    """
    win_table, draw_table, loss_table = probability_tables(
        team_strengths=team_strengths,
        max_draw_probability=max_draw_probability,
    )
    totals = (win_table + draw_table + loss_table).to_numpy()
    expected = np.ones_like(totals)

    # Note: 10e-5 is 1e-4 and 10e-8 is 1e-7, i.e. both tolerances are ten
    # times looser than numpy's defaults (1e-5 / 1e-8).
    return np.allclose(totals, expected, rtol=10e-5, atol=10e-8)

# Environment

In [4]:
# Tournament set-up
n_teams = 18
max_str = 8  # strength given to team 0, the strongest team
threshold_ranks = [4, 12]
# The agent is the team whose id is one past the last threshold rank.
agent_id = threshold_ranks[-1] + 1
mdp = 0.4  # passed as max_draw_probability to probability_tables and SwissRoundEnv

# Strengths decrease linearly from max_str (team 0) down to 0 (last team).
# Alternative profile kept for reference:
#team_strengths = [4 * 0.9 ** i for i in range(n_teams)]
team_strengths = np.linspace(max_str, 0, n_teams)
print(np.array(team_strengths).round(2))

wp, dp, lp = probability_tables(team_strengths=team_strengths, max_draw_probability=mdp)

# Agent's row of each table, relabelled by outcome, then transposed so that
# each row is an opponent and each column an outcome.
pd.concat(
    [
        table.loc[[agent_id]].rename(index={agent_id: outcome})
        for outcome, table in (('Win', wp), ('Draw', dp), ('Loss', lp))
    ]
).T.round(2)

[8.   7.53 7.06 6.59 6.12 5.65 5.18 4.71 4.24 3.76 3.29 2.82 2.35 1.88
 1.41 0.94 0.47 0.  ]


Unnamed: 0,Win,Draw,Loss
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,0.01,0.0,0.99
3,0.01,0.0,0.99
4,0.01,0.01,0.98
5,0.02,0.01,0.97
6,0.04,0.01,0.95
7,0.05,0.02,0.92
8,0.08,0.04,0.88
9,0.12,0.06,0.82


In [5]:
# Tournament parameters
n_rounds = 6
bonus_points = [30, 20]
# Number of tournaments simulated for each baseline policy
n_baselines_simu = 2000
name = f'lin_{max_str}'

# Sanity check: win/draw/loss probabilities must sum to 1 for every pairing.
print(f"Valid probability set-up : {check_probability(team_strengths, mdp)}")

env = SwissRoundEnv(
    name=name,
    n_teams=n_teams,
    n_rounds=n_rounds,
    team_strengths=team_strengths,
    max_draw_probability=mdp,
    threshold_ranks=threshold_ranks,
    bonus_points=bonus_points,
    # Agent_id just below last threshold
    agent_id=agent_id,
)

Valid probability set-up : True


#### Detailed tables

# Simulations

### Logs

In [10]:
# Read both experiment logs (first CSV column is the index) and stack them
# into a single frame with a fresh 0..n-1 index.
cpdf = pd.read_csv('../logs/calmip_logs.csv', index_col=0)
locdf = pd.read_csv('../logs/exp_logs.csv', index_col=0)
ldf = pd.concat([cpdf, locdf], ignore_index=True)

In [11]:
# Columns shown when browsing the experiment logs.
display_cols = [
    # run identification
    'exp_id', 'env_name', 'agent_id',
    # training configuration
    'n_episodes', 'train_epochs', 'lr', 'use_lr_scheduler', 'gradient_clipping',
    'epsilon_end', 'epsilon_decay', 'activation', 'layers', 'dropout',
    # test results
    'avg_test_rewards', 'std_test_rewards',
    'avg_test_gambits', 'std_test_gambits',
]

In [12]:
# Boolean mask: runs on the current environment, for the current agent,
# with at least 800 episodes.
query = (
    ldf['env_name'].eq(name)
    & ldf['n_episodes'].ge(800)
    & ldf['agent_id'].eq(agent_id)
)

In [13]:
# The 20 matching runs with the highest average test reward.
(
    ldf.loc[query, display_cols]
    .sort_values(by='avg_test_rewards', ascending=False)
    .head(20)
)

Unnamed: 0,exp_id,env_name,agent_id,n_episodes,train_epochs,lr,use_lr_scheduler,gradient_clipping,epsilon_end,epsilon_decay,activation,layers,dropout,avg_test_rewards,std_test_rewards,avg_test_gambits,std_test_gambits
48,loc_18,lin_8,13,12000,1.0,0.0003,True,1.0,0.04,0.9995,relu,256_64,0.3,13.124,11.335723,0.108,0.316759
47,loc_17,lin_8,13,12000,1.0,0.0003,True,1.0,0.04,0.9995,tanh,256_64,0.3,12.354,10.72449,0.255,0.449416


### Baselines Simulations

In [21]:
# Baseline with the 'win_all' policy (its results table is displayed).
simulation_wa = env.simulate_n_tournaments(
    n_baselines_simu,
    n_cores=32,
    policy='win_all',
    display_results=True,
)
# Baseline with the 'lose_first' policy (results table not displayed).
simulation_lf = env.simulate_n_tournaments(
    n_baselines_simu,
    n_cores=32,
    policy='lose_first',
    display_results=False,
)

# Agent's average reward and its standard deviation under each baseline.
baseline_reward_wa = simulation_wa.loc[agent_id, 'Avg_Reward']
baseline_std_wa = simulation_wa.loc[agent_id, 'Std_Reward']
baseline_reward_lf = simulation_lf.loc[agent_id, 'Avg_Reward']
baseline_std_lf = simulation_lf.loc[agent_id, 'Std_Reward']


Simulating tournaments with win_all policy: 100%|██████████| 2000/2000 [00:16<00:00, 123.96it/s]



Simulation Results (from 1970 tournaments):
Team | Strength | Avg Points | Avg Rank | Avg Reward | Std Reward | Top-4 % | Top-12 % | 
----------------------------------------------------------------------------------------
   0 |     8.00 |      15.02 |     2.10 |      62.84 |       9.17 |  92.74% | 100.00% | 
   1 |     7.53 |      13.82 |     2.86 |      58.61 |      12.95 |  82.64% | 100.00% | 
   2 |     7.06 |      12.77 |     3.74 |      53.56 |      15.45 |  69.34% |  99.95% | 
   3 |     6.59 |      11.92 |     4.68 |      47.71 |      16.75 |  52.79% |  99.75% | 
   4 |     6.12 |      11.21 |     5.54 |      42.48 |      16.45 |  38.27% |  98.93% | 
   5 |     5.65 |      10.61 |     6.38 |      37.69 |      14.92 |  24.92% |  98.02% | 
   6 |     5.18 |      10.04 |     7.27 |      34.49 |      14.20 |  17.97% |  95.28% | 
   7 |     4.71 |       9.52 |     8.17 |      31.01 |      12.03 |   9.70% |  92.89% | 
   8 |     4.24 |       8.99 |     9.03 |      28.00 |      10.9

Simulating tournaments with lose_first policy: 100%|██████████| 2000/2000 [00:15<00:00, 132.41it/s]




In [27]:
# One row per baseline policy, holding the agent's line of each simulation table.
bldf = pd.concat(
    [
        pd.DataFrame(simulation.loc[agent_id]).T.rename({agent_id: label})
        for label, simulation in (
            ('Win All', simulation_wa),
            ('Lose First', simulation_lf),
        )
    ]
)

print(
    f"Baselines average reward : WinAll = {baseline_reward_wa:.1f} ± {baseline_std_wa:.1f}, "
    f"LoseFirst = {baseline_reward_lf:.1f} ± {baseline_std_lf:.1f}"
)
# Team and Strength are the same on both rows, so they are left out of the display.
bldf.drop(columns=['Team', 'Strength']).round(2)

Baselines average reward : WinAll = 12.8 ± 10.8, LoseFirst = 11.9 ± 10.8


Unnamed: 0,Avg_Points,Avg_Rank,Avg_Reward,Std_Reward,Top-4 %,Top-12 %
Win All,6.32,13.43,12.78,10.83,0.0,0.32
Lose First,6.08,13.79,11.88,10.77,0.0,0.29


# Appendix

### Environment probability tables

In [17]:
# Win, draw and loss probabilities of every pairing must add up to 1.
np.allclose((wp + dp + lp).to_numpy(), np.ones(wp.shape))

True

In [6]:
# Win table: entry [i, j] is the probability that team i (row) beats team j (column).
wp.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.36,0.49,0.62,0.73,0.82,0.88,0.92,0.95,0.97,0.98,0.99,0.99,1.0,1.0,1.0,1.0,1.0,1.0
1,0.31,0.36,0.49,0.62,0.73,0.82,0.88,0.92,0.95,0.97,0.98,0.99,0.99,1.0,1.0,1.0,1.0,1.0
2,0.24,0.31,0.36,0.49,0.62,0.73,0.82,0.88,0.92,0.95,0.97,0.98,0.99,0.99,1.0,1.0,1.0,1.0
3,0.18,0.24,0.31,0.36,0.49,0.62,0.73,0.82,0.88,0.92,0.95,0.97,0.98,0.99,0.99,1.0,1.0,1.0
4,0.12,0.18,0.24,0.31,0.36,0.49,0.62,0.73,0.82,0.88,0.92,0.95,0.97,0.98,0.99,0.99,1.0,1.0
5,0.08,0.12,0.18,0.24,0.31,0.36,0.49,0.62,0.73,0.82,0.88,0.92,0.95,0.97,0.98,0.99,0.99,1.0
6,0.05,0.08,0.12,0.18,0.24,0.31,0.36,0.49,0.62,0.73,0.82,0.88,0.92,0.95,0.97,0.98,0.99,0.99
7,0.04,0.05,0.08,0.12,0.18,0.24,0.31,0.36,0.49,0.62,0.73,0.82,0.88,0.92,0.95,0.97,0.98,0.99
8,0.02,0.04,0.05,0.08,0.12,0.18,0.24,0.31,0.36,0.49,0.62,0.73,0.82,0.88,0.92,0.95,0.97,0.98
9,0.01,0.02,0.04,0.05,0.08,0.12,0.18,0.24,0.31,0.36,0.49,0.62,0.73,0.82,0.88,0.92,0.95,0.97


In [7]:
# Draw table: entry [i, j] is the probability that team i (row) draws against team j (column).
dp.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.29,0.2,0.13,0.09,0.06,0.04,0.02,0.01,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.2,0.29,0.2,0.13,0.09,0.06,0.04,0.02,0.01,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.13,0.2,0.29,0.2,0.13,0.09,0.06,0.04,0.02,0.01,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0
3,0.09,0.13,0.2,0.29,0.2,0.13,0.09,0.06,0.04,0.02,0.01,0.01,0.01,0.0,0.0,0.0,0.0,0.0
4,0.06,0.09,0.13,0.2,0.29,0.2,0.13,0.09,0.06,0.04,0.02,0.01,0.01,0.01,0.0,0.0,0.0,0.0
5,0.04,0.06,0.09,0.13,0.2,0.29,0.2,0.13,0.09,0.06,0.04,0.02,0.01,0.01,0.01,0.0,0.0,0.0
6,0.02,0.04,0.06,0.09,0.13,0.2,0.29,0.2,0.13,0.09,0.06,0.04,0.02,0.01,0.01,0.01,0.0,0.0
7,0.01,0.02,0.04,0.06,0.09,0.13,0.2,0.29,0.2,0.13,0.09,0.06,0.04,0.02,0.01,0.01,0.01,0.0
8,0.01,0.01,0.02,0.04,0.06,0.09,0.13,0.2,0.29,0.2,0.13,0.09,0.06,0.04,0.02,0.01,0.01,0.01
9,0.01,0.01,0.01,0.02,0.04,0.06,0.09,0.13,0.2,0.29,0.2,0.13,0.09,0.06,0.04,0.02,0.01,0.01


In [8]:
# Loss table: entry [i, j] is the probability that team i (row) loses to team j (column).
lp.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.36,0.31,0.24,0.18,0.12,0.08,0.05,0.04,0.02,0.01,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0
1,0.49,0.36,0.31,0.24,0.18,0.12,0.08,0.05,0.04,0.02,0.01,0.01,0.01,0.0,0.0,0.0,0.0,0.0
2,0.62,0.49,0.36,0.31,0.24,0.18,0.12,0.08,0.05,0.04,0.02,0.01,0.01,0.01,0.0,0.0,0.0,0.0
3,0.73,0.62,0.49,0.36,0.31,0.24,0.18,0.12,0.08,0.05,0.04,0.02,0.01,0.01,0.01,0.0,0.0,0.0
4,0.82,0.73,0.62,0.49,0.36,0.31,0.24,0.18,0.12,0.08,0.05,0.04,0.02,0.01,0.01,0.01,0.0,0.0
5,0.88,0.82,0.73,0.62,0.49,0.36,0.31,0.24,0.18,0.12,0.08,0.05,0.04,0.02,0.01,0.01,0.01,0.0
6,0.92,0.88,0.82,0.73,0.62,0.49,0.36,0.31,0.24,0.18,0.12,0.08,0.05,0.04,0.02,0.01,0.01,0.01
7,0.95,0.92,0.88,0.82,0.73,0.62,0.49,0.36,0.31,0.24,0.18,0.12,0.08,0.05,0.04,0.02,0.01,0.01
8,0.97,0.95,0.92,0.88,0.82,0.73,0.62,0.49,0.36,0.31,0.24,0.18,0.12,0.08,0.05,0.04,0.02,0.01
9,0.98,0.97,0.95,0.92,0.88,0.82,0.73,0.62,0.49,0.36,0.31,0.24,0.18,0.12,0.08,0.05,0.04,0.02


### RL Agent

In [34]:
# Baseline rewards printed again as a reference point for the RL agent below.
print(f"Baselines average reward : WinAll = {baseline_reward_wa:.1f} ± {baseline_std_wa:.1f}, "
      f"LoseFirst = {baseline_reward_lf:.1f} ± {baseline_std_lf:.1f}")

Baselines average reward : WinAll = 12.5 ± 11.0, LoseFirst = 11.6 ± 10.5


In [14]:
# Build the DQN agent on the environment, then train and evaluate it.
# (Keyword grouping below is inferred from the parameter names only.)
agent = DQNAgent(
    env,
    # network
    hidden_dims=[128, 32],
    activation='tanh',
    dropout=0.3,
    # optimisation
    lr=0.0005,
    use_lr_scheduler=True,
    max_grad_norm=1,
    train_epochs=8,
    # replay buffer / exploration
    buffer_size=10000,
    epsilon_decay=0.9995,
    # episode budget
    n_train_episodes=6000,
    n_test_episodes=2000,
)
agent.train()
agent.evaluate()


--- Training in progress ---


KeyboardInterrupt: 

### Verbose simulation

In [22]:
# Simulate one tournament; verbose=True prints every game of every round.
final_standings = env.simulate_tournament(verbose=True)

# Each standing is unpacked into five fields, so the header names all five
# in tuple order (it previously listed only three of them).
print("\nFinal standings (team_id, points, reward, opponent_average, strength):")
for rank, (team_id, points, reward, opp_avg, strength) in enumerate(final_standings, 1):
    print(f"Rank {rank}: Team {team_id} - Strength {strength:.2f} - Points: {points} (reward {reward}) - Opponent Avg: {opp_avg:.2f}")


--- Simulating round n°1 ---
Game : Team 8 (points : 0, strength : 2.12) vs Team 2 (points : 0, strength : 3.53) : Draw
Game : Team 5 (points : 0, strength : 2.82) vs Team 1 (points : 0, strength : 3.76) : Team 1 wins
Game : Team 12 (points : 0, strength : 1.18) vs Team 11 (points : 0, strength : 1.41) : Team 11 wins
Game : Team 17 (points : 0, strength : 0.00) vs Team 9 (points : 0, strength : 1.88) : Team 9 wins
Game : Team 0 (points : 0, strength : 4.00) vs Team 10 (points : 0, strength : 1.65) : Team 10 wins
Game : Team 3 (points : 0, strength : 3.29) vs Team 6 (points : 0, strength : 2.59) : Draw
Game : Team 15 (points : 0, strength : 0.47) vs Team 14 (points : 0, strength : 0.71) : Team 14 wins
Game : Team 13 (points : 0, strength : 0.94) vs Team 7 (points : 0, strength : 2.35) : Team 7 wins
Game : Team 4 (points : 0, strength : 3.06) vs Team 16 (points : 0, strength : 0.24) : Team 4 wins
--- Simulating round n°2 ---
Game : Team 1 (points : 3, strength : 3.76) vs Team 4 (points :