In [4]:
%matplotlib widget

import math
import random
import time
import numpy as np
import matplotlib.pyplot as plt

# Import Tensorflow 2.0
import tensorflow as tf

from time import sleep

from ipycanvas import Canvas, RoughCanvas, hold_canvas

from scipy.stats import truncnorm

gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:
    for gpu in gpus:
        print(gpu)
        tf.config.experimental.set_memory_growth(gpu, True)
        
#!jupyter labextension list

PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [25]:
class Env0():
    
    def __init__(self, num_steps):
        self.num_steps = num_steps
        self.num_trees = 2
        self.reset()

    def reset(self):
        
        self.trees = [(random.randrange(100,300)), (random.randrange(400,600))]
        self.fruits = [(90.,20),(105.,10)] #array of position, remaining lifetime
        self.agent = random.randrange(0., 800.)
        self.countfruits = 0
        self.score = 0.
        self.timestep = 0
        return self.obs()

    def obs(self):
        o = [(xt - self.agent) for (xt) in self.trees]
        return o
    
        
    def step(self, action):

        self.timestep += 1
        
        #remove old fruits
        self.fruits = [(x,t-1) for (x,t) in self.fruits if t>0]

        #add new ones
        for (xt) in self.trees:
            if random.random()>.9:
                x = xt + random.randrange(-50,50,1)
                t = random.randrange(50,100)
                self.fruits.append((x,t))

        #move the agent
        direction = -1 if action==0 else +1
        distance = direction * random.randint(1,5)
        self.agent += distance  #change the position
        
        #calculate the reward
        
        self.score -= abs(distance)  #consume energy, negative reward, for moving
        
        found = [i for i in range(len(self.fruits)) if self.fruits[i][0]==self.agent]
        self.score += 100 * len(found) #it is possible that several fruits have the same location
        self.countfruits += len(found)
        self.fruits = [f for i,f in enumerate(self.fruits) if i not in found]
        
        o = self.obs()
        reward = self.score
        terminated = True if self.timestep>=self.num_steps else False
        if self.agent<0 or self.agent>1000:
            terminated = True
        
        return o, reward, terminated   

    
    def init_canvas(self):
        #canvas = Canvas(width=1000, height=200)
        self.canvas = RoughCanvas(width=1000, height=200)
        self.canvas.font = "10px serif"
        display(self.canvas)        
        
        
    def update_canvas(self, sleeptime=0.02):
        #draw the scene
        with hold_canvas():
            # Clear the old animation step
            self.canvas.clear()

            y = 100
            size = 5

            self.canvas.stroke_text("time:%d"%self.timestep, 10, 10)
            #self.canvas.stroke_text("#fruits:%d"%len(self.fruits), 10, 30)
            self.canvas.stroke_text("#score:%d"%self.score, 10, 30)
            self.canvas.stroke_text("#found:%d"%self.countfruits, 10, 50)

            self.canvas.stroke_style = "blue"
            for (x) in self.trees:
                self.canvas.stroke_rect(x, y, size, size)

            self.canvas.fill_style = "red"
            xs = [x for (x,t) in self.fruits]
            ys = [100]*len(xs)
            self.canvas.fill_circles(xs, ys, size) #use vectorized version

            self.canvas.stroke_style = "green"
            self.canvas.stroke_rect(self.agent, y, size, size)


        # Animation frequency ~50Hz = 1./50. seconds
        if sleeptime>0:
            sleep(sleeptime)


    def play(self, model):
        self.init_canvas()
        terminated=False
        obs = self.reset()
        while not terminated:
            action = choose_action(model, obs)
            obs, reward, terminated = self.step(action)
            self.update_canvas(sleeptime=0)        
            
e = Env(500)

e.init_canvas()
terminated=False
obs = e.reset()
while not terminated:
    
    #policy:
    xt = obs[0]
    if abs(xt)>50:
        action = 1 if xt>0 else 0
    else:
        action = random.randint(0,1)
    
    #action = random.randint(0,1)

    obs, reward, terminated = e.step(action)
    e.update_canvas()
     

RoughCanvas(height=200, width=1000)

In [95]:
class Env():
    
    def __init__(self, num_steps):
        self.num_steps = num_steps
        self.num_trees = 2
        self.num_fruits = 40
        self.reset()

    def reset(self):
        
        self.trees = [(random.randrange(100,300)), (random.randrange(400,600))]

        self.fruits = np.zeros((self.num_fruits, 3))
        fruits = []
        for xt in self.trees:
            fruits.extend(xt+np.round(100*truncnorm.rvs(-1, 1, size=int(self.num_fruits/self.num_trees))))
        self.fruits[:,0]=fruits
        self.fruits[:,1]=np.random.uniform(10,50,self.num_fruits)  #number of time steps before fruit's location changes
        self.fruits[:int(self.num_fruits/2),2]=self.trees[0] #remember the tree where the next fruit will fall
        self.fruits[int(self.num_fruits/2):,2]=self.trees[1]
        
        self.agent = random.randrange(0., 800.)
        self.countfruits = 0
        self.score = 0.
        self.timestep = 0
        
        return self.obs()

    def obs(self):
        o = [(xt - self.agent) for (xt) in self.trees]
        return o
    
        
    def step(self, action):

        self.timestep += 1
        
        #replace old fruit
        self.fruits[:,1] -= 1 #age all fruits
        old = np.where(self.fruits[:,1]<=0, True, False) #find fruits with expired shelf life...
        self.fruits[:,0] = np.where(old, self.fruits[:,2]+np.round(100*truncnorm.rvs(-1,1)), self.fruits[:,0]) #drop a new fruit near the same tree
        self.fruits[:,1] = np.where(old, np.random.uniform(10,50), self.fruits[:,1]) #and give it a new shelf life

        #move the agent
        direction = -1 if action==0 else +1
        distance = direction * random.randint(1,5)
        self.agent += distance  #change the position
        
        #calculate the reward
        
        self.score -= abs(distance)  #consume energy, negative reward, for moving
        
        found = np.where(self.agent==self.fruits[:,0], True, False)
        count = np.count_nonzero(found==True)
        self.score += 1000 * count #it is possible that several fruits have the same location
        self.countfruits += count
        if count>0:
            self.fruits[:,0] = np.where(found, self.fruits[:,2]+np.round(100*truncnorm.rvs(-1,1)), self.fruits[:,0]) #drop a new fruit near the same tree
            self.fruits[:,1] = np.where(found, np.random.uniform(10,50), self.fruits[:,1]) #and give it a new shelf life
        
        o = self.obs()
        reward = self.score
        terminated = True if self.timestep>=self.num_steps else False
        if self.agent<0 or self.agent>1000:
            terminated = True
            score = -10000 #discourage suicidal policies
        
        return o, reward, terminated   

    
    def init_canvas(self):
        #canvas = Canvas(width=1000, height=200)
        self.canvas = RoughCanvas(width=1000, height=200)
        self.canvas.font = "10px serif"
        display(self.canvas)        
        
        
    def update_canvas(self, sleeptime=0.02):
        #draw the scene
        with hold_canvas():
            # Clear the old animation step
            self.canvas.clear()

            y = 100
            size = 5

            self.canvas.stroke_text("time:%d"%self.timestep, 10, 10)
            #self.canvas.stroke_text("#fruits:%d"%len(self.fruits), 10, 30)
            self.canvas.stroke_text("#score:%d"%self.score, 10, 30)
            self.canvas.stroke_text("#found:%d"%self.countfruits, 10, 50)

            self.canvas.stroke_style = "blue"
            for (x) in self.trees:
                self.canvas.stroke_rect(x, y, size, size)

            self.canvas.fill_style = "red"
            xs = self.fruits[:,0]
            ys = [100]*self.num_fruits
            self.canvas.fill_circles(xs, ys, size) #use vectorized version

            self.canvas.stroke_style = "green"
            self.canvas.stroke_rect(self.agent, y, size, size)


        # Animation frequency ~50Hz = 1./50. seconds
        if sleeptime>0:
            sleep(sleeptime)


    def play(self, model):
        self.init_canvas()
        terminated=False
        obs = self.reset()
        while not terminated:
            action = choose_action(model, obs)
            obs, reward, terminated = self.step(action)
            self.update_canvas(sleeptime=0)        
            
e = Env(500)

e.init_canvas()
terminated=False
obs = e.reset()
while not terminated:
    
    #policy:
    xt = obs[0]
    if abs(xt)>50:
        action = 1 if xt>0 else 0
    else:
        action = random.randint(0,1)
    
    #action = random.randint(0,1)

    obs, reward, terminated = e.step(action)
    e.update_canvas()
     

RoughCanvas(height=200, width=1000)

In [96]:
def create_rl_model(n_actions):
    
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(units=32, activation='relu'),
        tf.keras.layers.Dense(units=n_actions, activation=None)  #returns logits (un-normalized log-proba for each action)
    ])
    return model

#a logbook to remember observations, actions and rewards for an entire episode
class Memory:
    def __init__(self):
        self.clear()
    
    def clear(self):
        self.observations=[]
        self.actions=[]
        self.rewards=[]
        
    def add_to_memory(self, new_obs, new_action, new_reward):
        self.observations.append(new_obs)
        self.actions.append(new_action)
        self.rewards.append(new_reward)
        
    def __len__(self):
        return len(self.actions)

def choose_action(model, observation, single=True):

    # add batch dimension to the observation if only a single example was provided
    observation = np.expand_dims(observation, axis=0) if single else observation
    logits = model.predict(observation, verbose=0)
    action = tf.random.categorical(logits, num_samples=1)  #randomly pick an action - tf's categorical takes unornmalized log proba as input
    action = action.numpy().flatten()
    return action[0] if single else action

def normalize(x):
    x -= np.mean(x)
    x /= np.std(x)
    return x.astype(np.float32)

def discount_rewards(rewards, gamma=0.95):
    discounted_rewards = np.zeros_like(rewards)
    R=0
    for t in reversed(range(0, len(rewards))):
        R = R*gamma + rewards[t]
        discounted_rewards[t]=R
    return normalize(discounted_rewards)

def compute_loss(logits, actions, rewards):
    neg_logprob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=actions)
    loss = tf.reduce_mean(neg_logprob*rewards)
    return loss

def train_step(model, loss_function, optimizer, observations, actions, discounted_rewards):

    with tf.GradientTape() as tape:
        prediction = model(observations)
        loss = loss_function(prediction, actions, discounted_rewards)
    
    grads = tape.gradient(loss, model.trainable_variables)
    
    grads, _ = tf.clip_by_global_norm(grads, 2.0)
    
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    
    return loss

In [97]:

env = Env(500) #each episode lasts 500 time steps
model = create_rl_model(n_actions=2)

#env.play(model)


In [98]:
memory = Memory()
learning_rate = 0.001
optimizer = tf.keras.optimizers.Adam(learning_rate)
obs = env.reset()
memory.clear()
terminated = False
while not terminated:
    action = choose_action(model, obs)
    next_obs, reward, terminated = env.step(action)
    memory.add_to_memory(obs, action, reward)
    obs = next_obs
    

loss = train_step(model, compute_loss, optimizer, 
           observations=np.vstack(memory.observations),
           actions=np.array(memory.actions),
           discounted_rewards = discount_rewards(memory.rewards))

score = reward
loss

<tf.Tensor: shape=(), dtype=float32, numpy=0.20321432>

In [109]:
memory = Memory()

learning_rate = 0.001
optimizer = tf.keras.optimizers.Adam(learning_rate)
num_episodes=100

start = time.time()

for i_episode in range(num_episodes):
    obs = env.reset()
    memory.clear()
    terminated = False
    
    #run the episode, keeping the model constant
    while not terminated:
        action = choose_action(model, obs)
        next_obs, reward, terminated = env.step(action)
        memory.add_to_memory(obs, action, reward)
        obs = next_obs

    score = reward
    
    loss = train_step(model, compute_loss, optimizer, 
               observations=np.vstack(memory.observations),
               actions=np.array(memory.actions),
               discounted_rewards = discount_rewards(memory.rewards))
    
    end = time.time()
    print(i_episode, "score:", score, "loss:", loss.numpy(), "time remaining:", (num_episodes-i_episode+1)*(end-start)/(i_episode+1)/60)


0 score: 29507.0 loss: 0.13791351 time remaining: 45.027863121032716
1 score: 16500.0 loss: 0.06365309 time remaining: 44.29214537143707
2 score: 23481.0 loss: 0.11886824 time remaining: 43.52128487825394
3 score: 18514.0 loss: 0.04055286 time remaining: 42.918258583545686
4 score: 21502.0 loss: 0.14537612 time remaining: 42.36098740975062
5 score: 25484.0 loss: 0.020437947 time remaining: 41.85362351735433
6 score: 37493.0 loss: 0.20149529 time remaining: 41.47461810282299
7 score: 22495.0 loss: 0.16536476 time remaining: 40.99527920484543
8 score: 25563.0 loss: 0.2041778 time remaining: 40.50240138371785
9 score: 38498.0 loss: 0.028194826 time remaining: 40.04234384377798
10 score: 20460.0 loss: 0.021055793 time remaining: 39.7683297178962
11 score: 13493.0 loss: 0.01543826 time remaining: 39.2891383767128
12 score: 21545.0 loss: 0.18341012 time remaining: 38.72470948971235
13 score: 27474.0 loss: 0.15639982 time remaining: 38.18946231206258
14 score: 14476.0 loss: 0.19646773 time re

In [108]:
model_file = 'rl_'+time.strftime("%Y%m%d-%H%M%S")
print(model_file)
model.save(model_file)

rl_20221219-112929
INFO:tensorflow:Assets written to: rl_20221219-112929\assets


In [None]:
env = Env(500)
env.play(model)

RoughCanvas(height=200, width=1000)