In [None]:
#Credits and Thanks
#https://www.youtube.com/watch?v=bD6V3rcr_54
#https://www.gymlibrary.dev/content/environment_creation/

In [None]:
!pip install tensorflow==2.3.0
!pip install gym
!pip install keras
!pip install keras-rl2
!pip install stable-baselines3

In [40]:
from gym import Env
from gym.spaces import Discrete, Box
from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import BoltzmannQPolicy
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np
import random


#Glass of Water

In [None]:
class GlassOfWater(Env):
    """Toy Gym environment: fill a glass up to a randomly drawn preferred level.

    Every episode draws an integer target fill level (``preference``) and
    allows at most ``MAX_POURS`` pours. Each action pours a base amount of
    water scaled by a random factor in [0.5, 1.5]. The episode ends as soon as
    the fill level reaches the preference, or when the pours run out.

    The classic Gym API is used: ``reset()`` returns the observation and
    ``step()`` returns ``(observation, reward, done, info)``.

    NOTE(review): observations are returned as plain Python scalars, not as
    shape-(1,) arrays, and the fill level can end slightly above the declared
    upper bound of 100 (a final pour of up to 30 on top of a level just below
    a preference of 80). Both are kept as-is because downstream cells rely on
    the scalar observation.
    """

    # Inclusive bounds of the randomly drawn target fill level.
    MIN_PREFERENCE = 40
    MAX_PREFERENCE = 80
    # Number of pours allowed per episode.
    MAX_POURS = 5
    # Reward lost per unit of water poured beyond the preference.
    OVERSHOOT_PENALTY = 15

    def __init__(self):
        """Define the action/observation spaces and start the first episode."""
        # 0,1,2,3 -> put water (increasingly large pours)
        self.action_space = Discrete(4)

        # glass filling percentage
        self.observation_space = Box(low=np.array([0]), high=np.array([100]))

        # Reward for hitting the preference exactly; overshoot is deducted from it.
        self.best_reward = 1000

        self._start_episode()

    def _start_episode(self):
        """Draw a new preference and restore an empty glass with a full pour budget."""
        # how much water the person wants
        self.preference = random.randint(self.MIN_PREFERENCE, self.MAX_PREFERENCE)
        self.state = 0
        self.glass_length = self.MAX_POURS

    def step(self, action):
        """Pour water according to ``action`` and score the resulting fill level.

        Args:
            action: 0, 1 or 2 pour a base amount of 2, 5 or 10; any other
                value pours a base amount of 20.

        Returns:
            Tuple ``(state, reward, done, info)`` where ``state`` is the new
            fill level, ``done`` tells whether the episode ended and ``info``
            is an empty dict. The reward is:
              * ``best_reward - overshoot * 15`` when the preference is reached,
              * ``0`` when the pours run out before reaching it,
              * ``state - preference`` (negative) on every other step.
        """
        if action == 0:
            base_amount = 2
        elif action == 1:
            base_amount = 5
        elif action == 2:
            base_amount = 10
        else:
            base_amount = 20

        # The amount actually poured is the base amount scaled by random noise.
        self.state += base_amount * random.uniform(0.5, 1.5)
        self.glass_length -= 1

        info = {}
        if self.state >= self.preference:
            # Target reached: full reward minus a penalty for the overshoot.
            done = True
            reward = self.best_reward - (self.state - self.preference) * self.OVERSHOOT_PENALTY
        elif self.glass_length == 0:
            # Out of pours without reaching the preference.
            done = True
            reward = 0
        else:
            # Still below the preference: reward is the (negative) shortfall.
            done = False
            reward = self.state - self.preference

        return self.state, reward, done, info

    def render(self, mode='human'):
        """No-op renderer.

        ``mode`` is accepted (and ignored) so that callers using the classic
        Gym signature ``render(mode=...)`` do not raise ``TypeError``.
        """
        pass

    def reset(self):
        """Start a new episode and return the initial fill level (0)."""
        self._start_episode()
        return self.state


env = GlassOfWater()

In [None]:
# Baseline: play a few episodes with uniformly random actions and print the
# total reward collected in each one.
episodes = 14
for episode in range(1, episodes + 1):
    state = env.reset()
    score = 0
    done = False

    # Keep acting until the environment reports the end of the episode.
    while not done:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score = score + reward

    print(f'Episode:{episode} - Score:{score}')

Episode:1 - Score:-184.91360858546136
Episode:2 - Score:-252.27794540058915
Episode:3 - Score:-114.72656173355657
Episode:4 - Score:-179.78512520514792
Episode:5 - Score:-256.3751694098613
Episode:6 - Score:914.9084458409337
Episode:7 - Score:-194.11317478040883
Episode:8 - Score:-119.9327830273551
Episode:9 - Score:-180.45832462936113
Episode:10 - Score:-175.3842227775886
Episode:11 - Score:-112.5761304787597
Episode:12 - Score:-144.13679479797347
Episode:13 - Score:-209.36442630861802
Episode:14 - Score:856.9569203749451


In [None]:
# Observation shape and number of discrete actions; these size the input and
# output layers of the Q-network built below.
states, actions = env.observation_space.shape, env.action_space.n

# RL Agent

In [None]:
def build_model(states, actions):
    """Build the fully connected network used by the DQN agent.

    Args:
        states: shape tuple passed as ``input_shape`` of the first layer.
        actions: number of units in the linear output layer (one per action).

    Returns:
        An uncompiled ``Sequential`` model with three 24-unit ReLU hidden
        layers followed by a linear output layer.
    """
    hidden_units = 24
    network = Sequential([
        Dense(hidden_units, activation='relu', input_shape=states),
        Dense(hidden_units, activation='relu'),
        Dense(hidden_units, activation='relu'),
        Dense(actions, activation='linear'),
    ])
    return network


model = build_model(states, actions)

In [None]:
def build_agent(model, actions):
    """Wrap ``model`` in a keras-rl ``DQNAgent``.

    Args:
        model: network passed to the agent as its ``model``.
        actions: number of discrete actions (``nb_actions``).

    Returns:
        A ``DQNAgent`` using a Boltzmann Q policy and a 50000-entry
        sequential replay memory with a window length of 1.
    """
    return DQNAgent(
        model=model,
        nb_actions=actions,
        memory=SequentialMemory(limit=50000, window_length=1),
        policy=BoltzmannQPolicy(),
        nb_steps_warmup=30,
        target_model_update=1e-2,
    )

In [None]:
# Create the keras-rl agent, compile it with a default Adam optimizer (tracking
# mean absolute error) and train it on the environment for 50000 steps.
dqn = build_agent(model, actions)
dqn.compile(optimizer=Adam(), metrics=['mae'])
dqn.fit(env, nb_steps=50_000)

In [None]:
# Reset the shared environment (new random preference, empty glass, full pour
# budget) before the trained agent is tested in the next cell.
# NOTE(review): dqn.test presumably resets the env itself, so this call may be
# redundant — confirm.
env.reset()

In [None]:
# Run the trained keras-rl agent for 50 episodes without rendering and report
# the mean of the per-episode rewards.
scores = dqn.test(env, nb_episodes=50, visualize=False)
print("Mean =", np.mean(scores.history['episode_reward']))

Testing for 50 episodes ...
Episode 1: reward: 874.746, steps: 4
Episode 2: reward: 896.347, steps: 3
Episode 3: reward: 669.093, steps: 4
Episode 4: reward: 705.724, steps: 3
Episode 5: reward: 733.878, steps: 3
Episode 6: reward: 676.011, steps: 4
Episode 7: reward: 890.349, steps: 3
Episode 8: reward: 677.555, steps: 4
Episode 9: reward: 805.981, steps: 3
Episode 10: reward: 781.968, steps: 3
Episode 11: reward: 859.798, steps: 3
Episode 12: reward: 919.264, steps: 3
Episode 13: reward: 862.949, steps: 3
Episode 14: reward: 822.272, steps: 3
Episode 15: reward: 833.868, steps: 3
Episode 16: reward: 873.779, steps: 2
Episode 17: reward: 905.316, steps: 3
Episode 18: reward: 786.581, steps: 5
Episode 19: reward: 848.537, steps: 3
Episode 20: reward: 844.711, steps: 3
Episode 21: reward: 685.528, steps: 3
Episode 22: reward: 657.347, steps: 4
Episode 23: reward: 790.974, steps: 3
Episode 24: reward: 765.872, steps: 4
Episode 25: reward: 940.274, steps: 2
Episode 26: reward: 561.280, st

# Stable Baselines

In [None]:
# Reset the shared environment (new random preference, empty glass, full pour
# budget) before it is handed to Stable-Baselines3 in the next cell.
# NOTE(review): the vectorized wrapper presumably resets the env on its own,
# so this call may be redundant — confirm.
env.reset()

In [None]:
# Train a Stable-Baselines3 DQN (MLP policy) on the same environment instance,
# wrapped in a single-env DummyVecEnv, for 100000 timesteps.
# Note: this rebinds `model`, which previously held the Keras network.
model = DQN(
    policy='MlpPolicy',
    env=DummyVecEnv([lambda: env]),
    verbose=0,
)
model.learn(total_timesteps=100_000)

In [43]:
# Evaluate the Stable-Baselines3 agent over 50 episodes; the cell output is the
# value returned by evaluate_policy (shown below as a pair of numbers).
evaluate_policy(
    model=model,
    env=DummyVecEnv([lambda: env]),
    n_eval_episodes=50,
)

(782.826147825718, 104.70294774946038)