In [12]:
import gymnasium as gym
import random
from gymnasium.spaces import Discrete, MultiDiscrete
import numpy as np
import time
from gymnasium.wrappers import EnvCompatibility
from IPython.display import clear_output
class VacuumCleaner(gym.Env):
    def __init__(self, env_config=None):
        x = np.array([3,3,3,3],dtype=int)
        self.observation_space = MultiDiscrete(x)
        self.action_space = Discrete(5)
        self.max_episode_steps = 100
    def reset(self,seed=None,options=None):
        self.player = (1, 1) # the player starts at the top-left
        self.suck = True
        self.count=0
        self.steps =0
        self.field = np.array([
            [1,1,1,1,1,1],
            [1,0,2,0,0,1], # FFFF 
            [1,0,1,0,1,1], # FHFH
            [1,0,0,0,1,1], # FFFH
            [1,1,2,0,0,1],
            [1,1,1,1,1,1]  # HFFF
        ])
        obs = self.observation()
        info={}
        return obs,info
    def info(self):
        return []
    def observation(self):
        obs = [self.field[self.player[0]+1,self.player[1]],
                self.field[self.player[0],self.player[1]+1],
                self.field[self.player[0]-1,self.player[1]],
                self.field[self.player[0],self.player[1]-1]]
        obs = np.array(obs, dtype=int)
        return obs
                
    def reward(self):
        if self.suck:
            if self.field[self.player] == 2:
                r = 2
                self.field[self.player] = 0
                self.count +=1
            else:
                r=-0.2
        if self.count == 2 and self.player ==(0,0):
            r = 3
        else:
            r=-0.1
        return r  
    def done(self):
        return self.count == 2 and self.player == (0,0)

    def is_valid_loc(self, location):
        if self.field[location] == 1:
                return False
        else:
                return True
    def step(self, action):
        # Compute the new player location
        self.steps+=1
        if action == 0:   # left
            new_loc = (self.player[0], self.player[1]-1)
        elif action == 1: # down
            new_loc = (self.player[0]+1, self.player[1])
        elif action == 2: # right
            new_loc = (self.player[0], self.player[1]+1)
        elif action == 3: # up
            new_loc = (self.player[0]-1, self.player[1])
        elif action == 4:
            if self.suck:
                self.suck = False
            else:
                self.suck = True
            new_loc = self.player
        else:
            raise ValueError("Action must be in {0,1,2,3,4}")

        # Update the player location only if you stayed in bounds
        # (if you try to move out of bounds, the action does nothing)
        if self.is_valid_loc(new_loc):
            self.player = new_loc
        if self.steps==self.max_episode_steps:
            truncated = True
        else:
            truncated = False
        info = {}
        return (self.observation(),self.reward(),self.done(),truncated,info)

    def render(self):
        clear_output()
        for i in range(6):
            for j in range(6):
                if (i,j) == self.player:
                    if self.suck:
                        print("👾))", end="")
                    else:
                        print(" 👾 ", end="")
                elif self.field[(i,j)] == 2:
                    print(" 🦠 ", end="")
                elif self.field[(i,j)]==1:
                    print(" 🧱 ", end="")
                else:
                    print(" 🔲 ", end="")
            print()
    

In [5]:
import ray
ray.rllib.utils.check_env(VacuumCleaner())

AttributeError: module 'ray' has no attribute 'rllib'

In [3]:
from ray.tune.registry import register_env
def env_creator(env_config):
    # wrap and return an instance of your custom class
    return EnvCompatibility(VacuumCleaner())
register_env('VacuumCleaner', env_creator)

In [6]:
env = VacuumCleaner()

In [90]:
env.reset()
env.render()
while True:
    env.step(int(input()))
    env.reward()
    env.render()

    

 🧱  🧱  🧱  🧱  🧱  🧱 [1 0 0 1]

 🧱  🔲  🦠  🔲  🔲  🧱 [1 0 0 1]

 🧱  🔲  🧱  🔲  🧱  🧱 [1 0 0 1]

 🧱  🔲  🔲  🔲  🧱  🧱 [1 0 0 1]

 🧱  🧱 👾)) 🔲  🔲  🧱 [1 0 0 1]

 🧱  🧱  🧱  🧱  🧱  🧱 [1 0 0 1]



ValueError: Action must be in {0,1,2,3,4}

In [19]:
from ray.rllib.algorithms.ppo import PPO,PPOConfig
from ray.tune.logger import pretty_print

algo =  (PPOConfig()
        .resources(num_gpus=0) # i dont have gpu 
        .rollouts(num_rollout_workers=4) # with low memory, best use just one worker
        .environment(env=VacuumCleaner) # ray has built in gymnasium envs
        .framework("torch") # tensorflow also available, but i installed torch so im using this one
        .build() # not sure what this does , yet
)

mean_ = []
min_= []
max_= []
episodes = []

for i in range(50):
    result = algo.train()
    mean_.append(result['episode_reward_mean'])
    min_.append(result['episode_reward_min'])
    max_.append(result['episode_reward_max'])
    episodes.append(result['episodes_total'])

[2m[36m(RolloutWorker pid=18524)[0m 2023-01-29 12:27:20,248	INFO policy.py:1198 -- Policy (worker=1) running on CPU.
[2m[36m(RolloutWorker pid=18524)[0m 2023-01-29 12:27:20,250	INFO torch_policy_v2.py:110 -- Found 0 visible cuda devices.
[2m[36m(RolloutWorker pid=15408)[0m 2023-01-29 12:27:20,248	INFO policy.py:1198 -- Policy (worker=3) running on CPU.
[2m[36m(RolloutWorker pid=15408)[0m 2023-01-29 12:27:20,250	INFO torch_policy_v2.py:110 -- Found 0 visible cuda devices.
[2m[36m(RolloutWorker pid=12028)[0m 2023-01-29 12:27:20,250	INFO policy.py:1198 -- Policy (worker=4) running on CPU.
[2m[36m(RolloutWorker pid=12028)[0m 2023-01-29 12:27:20,250	INFO torch_policy_v2.py:110 -- Found 0 visible cuda devices.
[2m[36m(RolloutWorker pid=15772)[0m 2023-01-29 12:27:20,250	INFO policy.py:1198 -- Policy (worker=2) running on CPU.
[2m[36m(RolloutWorker pid=15772)[0m 2023-01-29 12:27:20,250	INFO torch_policy_v2.py:110 -- Found 0 visible cuda devices.
2023-01-29 12:27:20,285	

In [8]:
checkpoint_dir = algo.save('./vacuum')

In [20]:

env = VacuumCleaner()
rewards =[]
for i in range(10):
    obs, info = env.reset()
    episode_reward = 0
    terminated = truncated = False
    while not terminated and not truncated:
        env.render()
        time.sleep(1)
        action = algo.compute_single_action(obs)
        obs, reward, terminated, truncated, info = env.step(action)
        episode_reward += reward
    rewards.append(episode_reward)    
env.close()

 🧱  🧱  🧱  🧱  🧱  🧱 
 🧱  🔲  🦠  🔲  🔲  🧱 
 🧱  🔲  🧱  🔲  🧱  🧱 
 🧱  🔲  🔲  🔲  🧱  🧱 
 🧱  🧱  🔲  🔲 👾)) 🧱 
 🧱  🧱  🧱  🧱  🧱  🧱 


KeyboardInterrupt: 