# Stable Baselines Exploratory

In [1]:
import stable_baselines3
import gym
import numpy as np

# Environment
import gym
from gym import spaces

# Evaluate the environment
from stable_baselines3.common.evaluation import evaluate_policy

# Agent
from stable_baselines3 import A2C
# Policy
from stable_baselines3.ppo import MlpPolicy

## Environment

In [None]:
class CustomEnv(gym.Env):
    """Grid-planting environment that follows the classic gym interface.

    The state is a 2-D grid of ``uint8`` cells in which ``0`` marks an empty
    cell and any other value is the id of the species planted there.  An
    action is a 3-vector ``(x, y, species)`` that plants ``species`` in cell
    ``(x, y)``.

    Args:
        grid_size: ``(nx, ny)`` dimensions of the grid.
        nspecies: Highest species id; it must fit in a ``uint8`` (< 256).
        max_steps: Step budget of one episode.  Defaults to one step per
            grid cell.

    Raises:
        ValueError: If the grid is empty or ``nspecies`` does not fit in a
            ``uint8``.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, grid_size=(10, 10), nspecies=255, max_steps=None):
        super().__init__()

        nx, ny = grid_size
        if nx < 1 or ny < 1:
            raise ValueError(f"grid_size must be positive, got {grid_size!r}")
        if nspecies >= 256:
            raise ValueError(
                f"nspecies must be < 256 to fit in uint8, got {nspecies!r}"
            )

        self.grid_size = grid_size
        self.nspecies = nspecies
        self.max_steps = nx * ny if max_steps is None else max_steps

        # Action = (x, y, species).  Box bounds are inclusive, so the largest
        # valid grid index is n - 1, not n.
        self.action_space = spaces.Box(
            low=np.zeros(3, dtype=np.float32),
            high=np.array([nx - 1, ny - 1, nspecies], dtype=np.float32),
            dtype=np.float32,
        )
        # Observation = the whole grid, one species id per cell.
        self.observation_space = spaces.Box(
            low=0, high=self.nspecies, shape=(nx, ny), dtype=np.uint8,
        )

        # Same shape and dtype as the observation space.
        self._state = np.zeros((nx, ny), dtype=np.uint8)
        self._steps = 0
        self.done = False

    def _decode_action(self, action):
        """Turn a (possibly float) action vector into integer ``(x, y, species)``."""
        bounded = np.clip(
            np.asarray(action, dtype=np.float64).reshape(-1),
            self.action_space.low,
            self.action_space.high,
        )
        x, y, species = (int(value) for value in np.rint(bounded))
        return x, y, species

    def step(self, action):
        """Plant a species in one cell.

        Args:
            action: 3-vector ``(x, y, species)``; values are clipped to the
                action-space bounds and rounded to the nearest integer.

        Returns:
            ``(observation, reward, done, info)`` where ``reward`` is ``-1``
            when the targeted cell was already filled and ``1`` otherwise.
        """
        x, y, species = self._decode_action(action)

        # Penalise planting on a cell that is already filled.
        reward = -1 if self._state[x, y] != 0 else 1
        self._state[x, y] = species

        self._steps += 1
        # The episode ends when the grid is full or the step budget is spent.
        grid_full = bool(np.all(self._state != 0))
        self.done = grid_full or self._steps >= self.max_steps

        info = {}
        return self._state.copy(), reward, self.done, info

    def reset(self):
        """Empty the grid and return the initial observation."""
        self._state.fill(0)
        self._steps = 0
        self.done = False
        return self._state.copy()

    def render(self, mode='human'):
        """Rendering is not implemented."""
        ...

    def close(self):
        """Nothing to release."""
        ...

In [None]:
# Instantiate the env (the previous `CustomEnv(arg1, ...)` was template
# placeholder text: `arg1` is undefined and `...` is not a valid nspecies).
env = CustomEnv(grid_size=(10, 10), nspecies=255)

## Agent

In [None]:
# Use A2C's own "MlpPolicy" alias rather than the class imported from the
# PPO module, so the policy always matches the algorithm.
model = A2C("MlpPolicy", env, verbose=0)

Evaluate the random (untrained) agent

In [None]:
# Use a separate environment for evaluation (the previous
# `CustomEnv(arg1, ...)` was template placeholder text).
eval_env = CustomEnv(grid_size=(10, 10), nspecies=255)

# Random agent, before training
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

Train the agent

In [None]:
# Run 10,000 environment steps of training
model.learn(total_timesteps=10_000)

In [None]:
# Score the policy again now that it has been trained
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=100,
)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

In [4]:
grid_size = (10, 10)
nx, ny = grid_size
nspecies = 255

# Observation space: one uint8 value per grid cell, so its shape is grid_size
observation = spaces.Box(
    low=0,
    high=nspecies,
    shape=grid_size,
    dtype=np.uint8,
)

In [3]:
# Action space: a 3-vector built from (nx, ny, nspecies) upper bounds.
# NOTE(review): Box bounds are inclusive, so x == nx and y == ny are legal
# samples even though grid indices stop at nx - 1 / ny - 1 -- confirm whether
# the upper bounds should be nx - 1 and ny - 1.
action_space = spaces.Box(low=np.array([0,0,0]),
                              high=np.array([nx,ny,nspecies]),
                              shape=(3,))



In [6]:
observation.shape

(10, 10)

In [65]:
action_space

Box([0. 0. 0.], [ 10.  10. 255.], (3,), float32)

In [7]:
# Empty 10 x 10 grid used to try out indexing
state = np.zeros(shape=(10, 10))

In [10]:
state

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [13]:
# Fails with the IndexError recorded below: `action_space` is the Box space
# object itself, not an integer index or an integer/boolean array.
state[action_space]

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

## Neighbours

In [None]:
class Tuile:
    """A tile: the species lookup tables plus the action applied to it.

    Attributes set on every instance:
        action: The ``action`` argument, stored unchanged.
        SCORE: species id -> ``[price, productivity, capacity]``.
        NEMESIS: species id -> list of species ids.
        rewards: Named reward values (``'PENALTY_NEMESIS'``).
    """

    def __init__(self, action):
        # Keep the action instead of silently discarding the argument.
        self.action = action
        # veg : price, productivity, capacity
        # (no trailing comma here: it would wrap the dict in a 1-tuple)
        self.SCORE = {1: [10, 1, 1], 2: [8, 1, 1], 3: [5, 1, 1],
                      4: [2, 1, 1], 5: [6, 1, 1]}
        self.NEMESIS = {1: [], 2: [1, 5], 3: [], 4: [3], 5: []}
        self.rewards = {'PENALTY_NEMESIS': -5}

In [None]:
    def affinity(self, action_=None):
        """Return the penalty for planting a species next to one of its nemeses.

        Args:
            action_: Sequence ``(x, y, species)``.  When omitted, the
                instance's ``action`` attribute is used if it exists.

        Returns:
            ``self.rewards['PENALTY_NEMESIS']`` when at least one of the (up
            to eight) in-grid neighbours of ``(x, y)`` in ``self._state``
            holds a species listed in ``self.NEMESIS[species]``, else ``0``.

        Raises:
            ValueError: If no action is given and none is stored.
        """
        if action_ is None:
            action_ = getattr(self, "action", None)
        if action_ is None:
            raise ValueError("affinity() needs an action (x, y, species)")

        x, y, species = (int(value) for value in action_[:3])
        n_rows, n_cols = self._state.shape

        # Collect the surrounding cells, skipping positions outside the grid
        # on every side (negative indices would otherwise wrap around).
        neighbours = []
        for dx in (-1, 0, 1):
            for dy in (-1, 0, 1):
                if dx == 0 and dy == 0:
                    continue
                col, row = x + dx, y + dy
                if 0 <= col < n_rows and 0 <= row < n_cols:
                    neighbours.append(self._state[col, row])

        reward = 0
        # A species without a NEMESIS entry has no hostile neighbours.
        for nem in self.NEMESIS.get(species, ()):
            if nem in neighbours:
                reward = self.rewards['PENALTY_NEMESIS']
        return reward
        
    def revenue(self, action_=None):
        """Return the reward earned by planting the species of an action.

        Args:
            action_: Sequence ``(x, y, species)``.  When omitted, the
                instance's ``action`` attribute is used if it exists.

        Returns:
            ``self.rewards['new_value']`` (``0`` when that key is absent)
            plus the first entry of ``self.SCORE[species]`` (the price), or
            ``0`` when the species has no SCORE entry.

        Raises:
            ValueError: If no action is given and none is stored.
        """
        if action_ is None:
            action_ = getattr(self, "action", None)
        if action_ is None:
            raise ValueError("revenue() needs an action (x, y, species)")

        # Direct lookup instead of scanning every SCORE entry for the key.
        score = self.SCORE.get(int(action_[2]))
        if score is None:
            return 0
        return self.rewards.get('new_value', 0) + score[0]
                