In [1]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel

In [2]:
print(spaces.MultiDiscrete([3,3]).sample()[1])


2


In [3]:
server_build='b3-serverbuild-forcebased'
gfx_build = 'b2-force_based_movement_1472elo'

In [4]:
channel = EngineConfigurationChannel()

env = UnityEnvironment(file_name=f'{server_build}/SoccerBot' , side_channels=[channel])
channel.set_configuration_parameters(time_scale=100.0)


In [5]:
env.reset()

In [6]:
behavior_names = list(env.behavior_specs.keys())
print(behavior_names)

print(env.behavior_specs[behavior_names[1]][1])

['Soccer Bot?team=0', 'Soccer Bot?team=1']
Continuous: 0, Discrete: (3, 3)


In [7]:
print(env.behavior_specs[behavior_names[1]][1])

action_tuple = env.behavior_specs[behavior_names[0]][1].empty_action(1)


Continuous: 0, Discrete: (3, 3)


In [8]:
EPOCH = 1000
STEPS = 2048


elo_t0=1200.0
elo_t1=1200.0

avg_reward:dict =dict()
avg_reward[behavior_names[0]]=0.0
avg_reward[behavior_names[1]]=0.0

In [9]:
class SoccerBotEnv(gym.Env):


    def __init__(self , env: UnityEnvironment , behavior_name: str ) -> None:
        super(SoccerBotEnv , self).__init__()

        self.myenv = env;
        self.mybehavior_name=behavior_name

        self.action_spec = self.myenv.behavior_specs[self.mybehavior_name][1]
        self.act_tuple = self.action_spec.empty_action(1)

        self.action_space = spaces.MultiDiscrete([3,3,3,3]  , dtype=np.int32)
        
        self.observation_space = spaces.Box(low=0,high=1 , shape=(448,) , dtype=np.float32)


    def reset(self, seed=None, options=None):
        super().reset(seed=seed,options=options)
        self.myenv.reset()
        return np.zeros((448,),dtype=np.float32) , {} 

    def step(self, action):
        
        self.myenv.step()


        (decstept0 , termstept0) = env.get_steps(behavior_name=self.mybehavior_name)
        idx: int = 0
        
        for agent in decstept0.agent_id:
            self.act_tuple.add_discrete(np.array(action[2*idx : 2*(idx+1)]).reshape((1,2)))
            env.set_action_for_agent(self.mybehavior_name,agent,action=self.act_tuple)
            idx+=1
        
        obs = None 
        reward =0.0
        terminated : bool=False
        truncated = False
        info ={}
        if decstept0.obs[0].shape == (2,56) and decstept0.obs[1].shape == (2,168) :
            obs = np.concatenate((decstept0.obs[0],decstept0.obs[1]) , axis=1 , dtype=np.float32)
            
        else:
            terminated=True
            obs = np.concatenate((termstept0.obs[0],termstept0.obs[1]) , axis=1, dtype=np.float32)
        
        if( len(decstept0.group_reward) != 2):
            reward = termstept0.group_reward.mean()
        else:
            reward = decstept0.group_reward.mean()
        
        # if reward != 0.0 : print(f'Behavior: {self.mybehavior_name} Reward: {reward}')
        avg_reward[self.mybehavior_name]+=reward
        
        obs = np.reshape(obs ,newshape=(448,))
        return (
            obs,
            float(reward),
            terminated,
            truncated,
            info
        )
    def render(self):
        pass

    def close(self):
        self.myenv.close()

In [10]:
from stable_baselines3.common.env_checker import check_env

In [11]:
gymenv_t0 = SoccerBotEnv(env=env , behavior_name=behavior_names[0])
gymenv_t1 = SoccerBotEnv(env=env , behavior_name=behavior_names[1])
# If the environment don't follow the interface, an error will be thrown
# # check_env(gymenv_t0, warn=True)
# check_env(gymenv_t1, warn=True)

In [12]:
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env


In [13]:
custom_hyperparams = {
    'n_steps': 2048,             # Number of steps to run for each environment per update
    'batch_size': 64,            # Minibatch size
    'n_epochs': 10,              # Number of epochs to update the policy
    'learning_rate': 3e-4,       # Learning rate
    'gamma': 0.99,               # Discount factor
    'clip_range': 0.2,           # Clipping parameter for PPO
    'gae_lambda': 0.95,          # Factor for trade-off of bias vs variance for GAE
    'ent_coef': 0.01,            # Entropy coefficient
    'vf_coef': 0.5,              # Value function coefficient
    'max_grad_norm': 0.5,        # Maximum norm for gradient clipping
}

In [14]:
import torch 
import os

In [15]:
def calculate_elo(agent_rating, opponent_rating, agent_reward, opponent_reward, k=32):

    # Determine the outcome based on rewards
    if agent_reward > opponent_reward:
        outcome = 1  # Agent wins
    elif agent_reward < opponent_reward:
        outcome = 0  # Agent loses
    else:
        outcome = 0.5  # Draw

    # Calculate the expected scores
    expected_agent = 1 / (1 + 10 ** ((opponent_rating - agent_rating) / 400))
    expected_opponent = 1 / (1 + 10 ** ((agent_rating - opponent_rating) / 400))

    # Calculate the new ratings
    new_agent_rating = agent_rating + k * (outcome - expected_agent)
    new_opponent_rating = opponent_rating + k * ((1 - outcome) - expected_opponent)

    return new_agent_rating, new_opponent_rating

In [16]:
policy_kwargs: dict = dict(activation_fn=torch.nn.ReLU,
                     net_arch=dict(pi=[512, 512], vf=[512, 512]))

In [17]:
# model_t0 = PPO("MlpPolicy", gymenv_t0, verbose=0 , policy_kwargs=policy_kwargs , learning_rate=1e-4 ,batch_size=2048 , ent_coef=0.01)
# model_t0_param = PPO("MlpPolicy", gymenv_t0, verbose=0 , policy_kwargs=policy_kwargs , learning_rate=1e-4 ,batch_size=2048 ,ent_coef=0.01)
# model_t1 = PPO("MlpPolicy", gymenv_t1, verbose=0 , policy_kwargs=policy_kwargs , learning_rate=1e-4 ,batch_size=2048,ent_coef=0.01)


In [18]:
algo = 'A2C'
filepath = f'team_0/{algo}'


In [19]:
model_t0 = A2C("MlpPolicy", gymenv_t0, verbose=0 , policy_kwargs=policy_kwargs , learning_rate=1e-5 , tensorboard_log=f'team_0/{algo}/log' )
model_t0_param = A2C("MlpPolicy", gymenv_t0, verbose=0 , policy_kwargs=policy_kwargs , learning_rate=1e-5  )
model_t1 = A2C("MlpPolicy", gymenv_t1, verbose=0 , policy_kwargs=policy_kwargs , learning_rate=1e-5 )



In [20]:
model_t1.set_parameters(model_t0.get_parameters())

In [21]:
from stable_baselines3.common.callbacks import BaseCallback


class Team_Callback(BaseCallback):

    def __init__(self, behavior_name: str ,  model ,verbose: int = 1   ):
        super().__init__(verbose)
        self.behavior_name : str = behavior_name
        self.model = model

    def _on_training_start(self) -> None:

        pass

    def _on_rollout_start(self) -> None:

        pass

    def _on_step(self) -> bool:
        (decstep , termstep) = env.get_steps(behavior_name=self.behavior_name)
        
        if decstep.obs[0].shape == (2,56) and decstep.obs[1].shape == (2,168) :
            obs = np.concatenate((decstep.obs[0],decstep.obs[1]) , axis=1 , dtype=np.float32)            
        else:
            obs = np.concatenate((termstep.obs[0],termstep.obs[1]) , axis=1, dtype=np.float32)
        
        obs = np.reshape(obs ,newshape=(448,))
        act_t1,_ = self.model.predict(observation=obs , deterministic=True)
        
        
        idx: int = 0
        for agent in decstep.agent_id:
            action_tuple.add_discrete(np.array(act_t1[2*idx : 2*(idx+1)]).reshape((1,2)))
            env.set_action_for_agent(self.behavior_name,agent,action=action_tuple)
            idx+=1
        
        if( len(decstep.group_reward) != 2):
            reward = termstep.group_reward.mean()
        else:
            reward = decstep.group_reward.mean()
        
        # if reward != 0.0 : print(f'Behavior: {self.behavior_name} Reward: {reward}')
        avg_reward[self.behavior_name]+=reward;
        return True

    def _on_rollout_end(self) -> None:

        pass

    def _on_training_end(self) -> None:
        pass

In [22]:
t0_callback = Team_Callback(behavior_names[0], model_t0)
t1_callback = Team_Callback(behavior_names[1], model_t1)

In [23]:
elo_dict:dict = dict()

In [24]:
WINDOW = 10

for epoch in range(EPOCH):

    avg_reward[behavior_names[0]]=0.0
    avg_reward[behavior_names[1]]=0.0
    

    
    if epoch > 0 : 

        rand_model = np.random.randint( int(max(0 , epoch - WINDOW)) ,epoch)
        elo_t1 = elo_dict[rand_model]
        model_t0_param.set_parameters(f"{filepath}/{algo}_soccerbot_t0_{rand_model}" , exact_match=False)
        model_t1.set_parameters(load_path_or_dict=model_t0_param.get_parameters())
    
    if epoch > WINDOW:
        os.remove(f"{filepath}/{algo}_soccerbot_t0_{epoch - WINDOW - 1}.zip")
    
    model_t0 = model_t0.learn(total_timesteps=STEPS , callback= t1_callback)
    elo_t0 , elo_t1 = calculate_elo(elo_t0 , elo_t1 , avg_reward[behavior_names[0]] , avg_reward[behavior_names[1]] , 16)
    
    model_t0.save(f"{filepath}/{algo}_soccerbot_t0_{epoch}")
    print(f'Epoch: {epoch} ELO: {elo_t0}  Reward_team0: {avg_reward[behavior_names[0]]} Reward_team1: {avg_reward[behavior_names[1]]}')
    elo_dict[epoch] = elo_t0

    # if EPOCH % 2 == 0 :
    # else:
    #     avg_reward[behavior_names[0]]=0.0
    #     avg_reward[behavior_names[1]]=0.0

    #     model_t1 = model_t1.learn(total_timesteps=STEPS , callback= t0_callback)
    #     elo_t1 , elo_t0 = calculate_elo(elo_t1 , elo_t0 , avg_reward[behavior_names[1]] , avg_reward[behavior_names[0]] , 16)
    #     print(f'ELO: {elo_t1} Reward team0: {avg_reward[behavior_names[0]]} Reward team1: {avg_reward[behavior_names[1]]}')



Epoch: 0 ELO: 1208.0  Reward_team0: 1.0056000053882599 Reward_team1: -1.0056000053882599
Epoch: 1 ELO: 1200.0  Reward_team0: -0.35839998722076416 Reward_team1: 0.35839998722076416
Epoch: 2 ELO: 1208.1841742594847  Reward_team0: 0.10279999673366547 Reward_team1: -0.10279999673366547
Epoch: 3 ELO: 1208.179933490838  Reward_team0: 0.0 Reward_team1: 0.0
Epoch: 4 ELO: 1199.9916183552216  Reward_team0: -0.946799979545176 Reward_team1: 0.946799979545176
Epoch: 5 ELO: 1207.9916183552216  Reward_team0: 1.282399982213974 Reward_team1: -1.282399982213974
Epoch: 6 ELO: 1207.8076369880596  Reward_team0: 0.0 Reward_team1: 0.0
Epoch: 7 ELO: 1207.6276968620675  Reward_team0: 0.0 Reward_team1: 0.0
Epoch: 8 ELO: 1215.636269455341  Reward_team0: 0.22519999742507935 Reward_team1: -0.22519999742507935
Epoch: 9 ELO: 1207.4646073286779  Reward_team0: -0.5260000228881836 Reward_team1: 0.5260000228881836
Epoch: 10 ELO: 1199.2925619292612  Reward_team0: -0.188400000333786 Reward_team1: 0.188400000333786
Epoch: 