In [2]:
# Environment setup: install TensorFlow 2.0 alpha (GPU build), gym, gym's box2d extra and PyDrive.
# NOTE(review): only tensorflow-gpu is pinned to an exact version; gym, gym[box2d] and PyDrive
# install whatever release is current, so a re-run may pick up different versions.
!pip install -q tensorflow-gpu==2.0.0-alpha0
!pip install gym
!pip install gym[box2d]
!pip install -U -q PyDrive

[K    100% |████████████████████████████████| 332.1MB 50kB/s 
[K    100% |████████████████████████████████| 3.0MB 8.7MB/s 
[K    100% |████████████████████████████████| 419kB 13.6MB/s 
[K    100% |████████████████████████████████| 61kB 31.6MB/s 
Collecting box2d-py>=2.3.5 (from gym[box2d])
[?25l  Downloading https://files.pythonhosted.org/packages/06/bd/6cdc3fd994b0649dcf5d9bad85bd9e26172308bbe9a421bfc6fdbf5081a6/box2d_py-2.3.8-cp36-cp36m-manylinux1_x86_64.whl (448kB)
[K    100% |████████████████████████████████| 450kB 9.8MB/s 
Installing collected packages: box2d-py
Successfully installed box2d-py-2.3.8
[K    100% |████████████████████████████████| 993kB 23.6MB/s 
[?25h  Building wheel for PyDrive (setup.py) ... [?25ldone
[?25h

In [0]:
import tensorflow as tf
import gym
import numpy as np
import cv2 as cv
from IPython.display import clear_output
import matplotlib.pyplot as plt

from sklearn import preprocessing

import pickle

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
class ReplayBuffer:
    """Fixed-capacity store of transitions with uniform random sampling.

    Transitions are kept as ``[cur_state, action, next_state, reward, final]``
    lists, where ``final`` is 1 for a terminal transition and 0 otherwise.
    Once ``max_size`` entries have been written, new transitions overwrite the
    oldest slots in circular order.
    """

    def __init__(self, seed):
        """Create an empty buffer whose sampling is driven by ``seed``."""
        self.buffer = []
        self.random_generator = np.random.RandomState(seed=seed)
        self.max_size = 1000000
        # Slot written by the most recent append; -1 means nothing written yet.
        self.index = -1

    def append(self, cur_state, action, next_state, reward, done):
        """Store one transition, overwriting the oldest slot when full."""
        transition = [cur_state, action, next_state, reward, 1 if done else 0]

        # Advance the write cursor, wrapping around at capacity.
        self.index = (self.index + 1) % self.max_size
        if self.index < len(self.buffer):
            self.buffer[self.index] = transition
        else:
            self.buffer.append(transition)

    def get_size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def get_batch(self, size):
        """Return ``size`` transitions drawn uniformly with replacement."""
        positions = self.random_generator.randint(0, len(self.buffer), size)

        return [self.buffer[position] for position in positions]

In [0]:
class QFunction:
    """Fully connected Keras network mapping a state vector to one value per action.

    The network is a stack of Dense layers, each followed by a PReLU activation,
    with a final linear Dense layer of ``num_of_actions`` units. It is compiled
    with mean-squared-error loss and the Adam optimizer.
    """

    def __init__(self, num_of_inputs, num_of_actions, image_size=None, layer_units_inputs=[1500, 1000], lr=0.0001):
        """Build and compile the network.

        Args:
            num_of_inputs: length of the state vector fed to the network.
            num_of_actions: number of output units.
            image_size: accepted but not used by this implementation.
            layer_units_inputs: widths of the hidden layers, in order; must
                contain at least one entry.
            lr: value passed to the Adam optimizer.
        """
        keras = tf.keras
        widths = layer_units_inputs

        def dense_prelu(tensor, units):
            # One hidden stage: linear layer followed by a learnable-slope ReLU.
            tensor = keras.layers.Dense(units=units)(tensor)
            return keras.layers.PReLU()(tensor)

        net_input = keras.Input(shape=(num_of_inputs,))
        hidden = dense_prelu(net_input, widths[0])
        for position in range(1, len(widths)):
            hidden = dense_prelu(hidden, widths[position])
        q_values = keras.layers.Dense(units=num_of_actions)(hidden)

        self.Q_function = keras.Model(inputs=net_input, outputs=q_values)
        optimizer = keras.optimizers.Adam(lr)
        self.Q_function.compile(loss='mean_squared_error', optimizer=optimizer)

    def predict(self, state):
        """Return the underlying model's prediction for ``state``."""
        return self.Q_function.predict(state)

    def train_step(self, cur_states, targets):
        """Run one ``train_on_batch`` update and return the reported loss."""
        return self.Q_function.train_on_batch(cur_states, targets)

In [0]:
class Agent:
    """Epsilon-greedy Q-learning agent for a gym environment with a box action space.

    The continuous action vector is discretized into ``discretization_steps``
    bins per dimension; every combination of bins is one discrete action whose
    value is estimated by the Q-network. When ``discretization_steps`` is even,
    one extra discrete action that maps to the all-zero action vector is added.
    Transitions are stored in a ReplayBuffer and the network is trained on
    random mini-batches after every environment step.
    """

    def __init__(self, env_name='BipedalWalkerHardcore-v2', number_of_steps=10000000, discretization_steps=5,
                 batch_size=32, seed=42, episodes_to_average=40, learning_rate=0.0001, scaler = None, learning_steps=0):
        """Create the environment, replay buffer and Q-network.

        Args:
            env_name: id passed to ``gym.make``.
            number_of_steps: environment steps performed by one call to ``run``.
            discretization_steps: number of bins per action dimension.
            batch_size: mini-batch size sampled from the replay buffer.
            seed: seed for the replay buffer's sampler.
            episodes_to_average: window length for the running mean of episode rewards.
            learning_rate: learning rate of the Q-network optimizer.
            scaler: optional object with a ``transform`` method applied to every state.
            learning_steps: initial value of the training-step counter.
        """
        # environment
        self.env_name = env_name
        self.env = gym.make(env_name)

        # state normalisation - doesn't work here - params unbounded
        self.state_scaler = None

        # learning params
        self.number_of_steps = number_of_steps
        self.batch_size = batch_size

        # random generator seed
        self.seed = seed

        # ReplayBuffer for observations
        self.replay_buffer = ReplayBuffer(self.seed)

        # discretization: bin width per action dimension and the lower bounds
        self.discretization_steps = discretization_steps
        self.low = self.env.action_space.low
        self.action_steps = (self.env.action_space.high - self.env.action_space.low) / discretization_steps
        self.num_of_actions = len(self.low)

        # every decoded action component is scaled by this factor
        self.clip_action = 0.7

        # Q function
        self.num_of_inputs = len(self.env.observation_space.high)
        self.num_of_outputs = discretization_steps ** self.num_of_actions
        if self.discretization_steps % 2 == 0:
            # an even bin count has no bin centred on zero, so add a dedicated "all zeros" action
            self.num_of_outputs += 1
        # Remember the network hyper-parameters so a fresh run rebuilds the same architecture.
        self.learning_rate = learning_rate
        self.layer_units = [1500, 1300]
        self.Q_function = self._build_q_function()
        self.gamma = 0.99

        self.learning_steps = learning_steps

        # epsilon greedy policy
        self.epsilon_start = 0.95
        self.epsilon_min = 0.01
        self.epsilon = self.epsilon_start
        self.epsilon_decay_factor = 0.99991  # multiplicative per-step decay; 0.99991**10000 is about 0.41

        # training statistics
        self.losses = []
        self.episodes_rewards = []
        self.episodes_end_index = []
        self.reward_before_fail = []
        self.average_reward_before_fail = []
        self.episodes_to_average = episodes_to_average
        self.eps = []

        # Google Drive file ids; overwritten with the ids of the latest upload by write_colab
        self.buffer_file_id = '14ThpPSMN-xL3zp12Vqd-ksVT_qTI-yXf'
        self.model_file_id = '15trlpnUvX-N4EYqL4H0P2H_q4aAF9Gdo'

        # scaler
        self.scaler = scaler

    def _build_q_function(self):
        """Return a new Q-network using the hyper-parameters stored on the agent."""
        return QFunction(self.num_of_inputs, self.num_of_outputs, lr=self.learning_rate,
                         layer_units_inputs=list(self.layer_units))

    def _drive_client(self):
        """Authenticate the Colab user and return a PyDrive ``GoogleDrive`` handle."""
        auth.authenticate_user()
        gauth = GoogleAuth()
        gauth.credentials = GoogleCredentials.get_application_default()
        return GoogleDrive(gauth)

    def prepropcess_state(self, state):
        """Return ``state`` transformed by the scaler, or unchanged when no scaler is set."""
        if self.scaler is not None:
            # transform expects a 2-D array, so wrap the state as a single row
            return self.scaler.transform(state.reshape(1, -1))[0]
        return state

    # convert number of discrete action to continuous space.
    def num_to_action(self, action_num):
        """Decode a discrete action index into a continuous action vector.

        The index is read as a base-``discretization_steps`` number whose i-th
        digit selects the bin of action dimension i; each component is the
        centre of its bin multiplied by ``clip_action``. With an even bin
        count the last index decodes to the all-zero vector.
        """
        assert self.num_of_outputs > action_num

        cur_actions = np.zeros(self.num_of_actions)

        if self.discretization_steps % 2 == 0 and action_num == self.num_of_outputs - 1:
            return cur_actions

        i = 0
        while action_num > 0:
            cur_actions[i] += self.action_steps[i] * (action_num % self.discretization_steps)
            action_num = action_num // self.discretization_steps
            i += 1
        for i in range(self.num_of_actions):
            # shift to the bin centre, then shrink by the clip factor
            cur_actions[i] += self.low[i] + self.action_steps[i] * 0.5
            cur_actions[i] *= self.clip_action
        return cur_actions

    # calculate targets
    def targets(self, batch):
        """Build training targets for a batch of transitions.

        Args:
            batch: sequence of ``[cur_state, action, next_state, reward, final]`` entries.

        Returns:
            ``(cur_states, target_qs)`` where ``target_qs`` equals the network's
            current predictions except at each taken action, which is replaced by
            ``reward + gamma * max_a Q(next_state, a) * (1 - final)``.
        """
        cur_state, action, next_state, reward, done = (np.array(column) for column in zip(*batch))

        target_val = self.Q_function.predict(cur_state)
        next_best = np.max(self.Q_function.predict(next_state), axis=1)
        # Discounted Bellman target; terminal transitions keep only the immediate reward.
        target = reward + self.gamma * next_best * (1 - done)
        target_val[np.arange(len(target_val)), action] = target
        return cur_state, target_val

    def update_epsilon(self):
        """Decay epsilon by one step, never going below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay_factor)

    # epsilon greedy action
    def sample_action(self, state, epsilon):
        """Pick a discrete action epsilon-greedily.

        Returns:
            ``(action_num, action)``: the discrete index and its decoded
            continuous action vector.
        """
        random_num = np.random.random()
        if random_num < epsilon:
            action_num = np.random.randint(0, self.num_of_outputs)
        else:
            q_values = self.Q_function.predict([state])
            action_num = np.argmax(q_values)
        return action_num, self.num_to_action(action_num)

    def plot_data(self, episode_rewards_per_plot=700, losses_per_plot=30000, plot_eps=True):
        """Redraw the training dashboard: episode rewards, loss and (optionally) epsilon.

        Args:
            episode_rewards_per_plot: maximum number of most recent episodes shown.
            losses_per_plot: maximum number of most recent loss values shown.
            plot_eps: whether to draw the epsilon history in a third panel.
        """
        clear_output(True)
        plt.figure(figsize=(20,5))

        plt.subplot(131)
        st = self.learning_steps
        plt.title('Rewards after {} steps'.format(st))
        n = min(len(self.episodes_rewards), episode_rewards_per_plot)
        episodes_index = np.arange(len(self.episodes_rewards)-n, len(self.episodes_rewards))

        plt.plot(episodes_index, self.episodes_rewards[-n:])
        plt.plot(episodes_index, np.zeros(n))
        plt.plot(episodes_index, self.average_reward_before_fail[-n:])

        plt.subplot(132)
        plt.title('Loss')
        # Size the window from the recorded losses: learning_steps can exceed len(self.losses)
        # (non-zero initial counter, or losses cleared by a fresh run), which would otherwise
        # give x and y different lengths.
        n_losses = min(len(self.losses), losses_per_plot)
        plt.plot(np.arange(st - n_losses, st), self.losses[len(self.losses) - n_losses:])

        if plot_eps:
            plt.subplot(133)
            plt.title('eps')
            plt.plot(self.eps)
        plt.show()

    def read_colab(self, buffer_id = None, model_id = None):
        """Download a replay buffer and/or a saved model from Google Drive.

        Args:
            buffer_id: Drive file id of a pickled replay buffer, or None to skip.
            model_id: Drive file id of a saved Keras model, or None to skip.
        """
        drive = self._drive_client()

        if buffer_id is not None:
            fname = "buffer.txt"
            file_obj = drive.CreateFile({'id': buffer_id})
            file_obj.GetContentFile(fname)
            # SECURITY: pickle.load can execute arbitrary code; only load buffers you uploaded yourself.
            with open (fname, 'rb') as fp:
                self.replay_buffer =  pickle.load(fp)

        if model_id is not None:
            fname = 'model.h5'
            file_obj = drive.CreateFile({'id': model_id})
            file_obj.GetContentFile(fname)
            self.Q_function.Q_function = tf.keras.models.load_model(fname)

    def write_colab(self):
        """Upload the pickled replay buffer and the saved model to Google Drive.

        The Drive ids of the two uploaded files are stored in
        ``self.buffer_file_id`` and ``self.model_file_id``.
        """
        drive = self._drive_client()

        fname = 'buffer.txt'
        with open(fname, 'wb') as fp:
            pickle.dump(self.replay_buffer, fp)

        buffer_file = drive.CreateFile({'title' : fname})
        buffer_file.SetContentFile(fname)
        buffer_file.Upload()
        self.buffer_file_id = buffer_file.get('id')

        fname = 'model.h5'
        self.Q_function.Q_function.save(fname)
        model_file = drive.CreateFile({'title' : fname})
        model_file.SetContentFile(fname)
        model_file.Upload()
        self.model_file_id = model_file.get('id')

    # run and learn
    def run(self, continue_learning = False, set_epsilon_to_min = False):
        """Interact with the environment for ``number_of_steps`` steps while training.

        Args:
            continue_learning: when False, training statistics and epsilon are
                reset and the Q-network is rebuilt from scratch with the
                constructor's hyper-parameters (the replay buffer is kept).
            set_epsilon_to_min: when True, start from ``epsilon_min``; this is
                applied after any reset so it is honoured for fresh runs too.
        """
        cur_state = self.prepropcess_state(self.env.reset())

        if not continue_learning:
            self.losses = []
            self.episodes_rewards = []
            self.episodes_end_index = []
            self.average_reward_before_fail = []
            self.reward_before_fail = []
            self.epsilon = self.epsilon_start
            self.eps = []
            self.Q_function = self._build_q_function()

        if set_epsilon_to_min:
            self.epsilon = self.epsilon_min

        cur_episode_reward = 0

        try:
            for _ in range(self.number_of_steps):

                self.update_epsilon() # epsilon decay
                self.eps.append(self.epsilon)

                action_num, action = self.sample_action(tf.expand_dims(cur_state, 0), self.epsilon)

                next_state, reward, done, _info = self.env.step(action)
                next_state = self.prepropcess_state(next_state)
                cur_episode_reward += reward

                self.replay_buffer.append(cur_state, action_num, next_state, reward, done)
                cur_state = next_state

                trained_this_step = False
                if self.replay_buffer.get_size() >= self.batch_size:
                    batch = self.replay_buffer.get_batch(self.batch_size)
                    states, target_qs = self.targets(batch)

                    loss = self.Q_function.train_step(states, target_qs)
                    self.learning_steps += 1
                    self.losses.append(loss)
                    trained_this_step = True

                if done:
                    # Episode rewards are recorded with a +100 offset (original note: "to clearly identify").
                    self.episodes_rewards.append(cur_episode_reward + 100)
                    self.episodes_end_index.append(len(self.losses) - 1)
                    self.average_reward_before_fail.append(np.mean(self.episodes_rewards[-self.episodes_to_average:]))
                    cur_episode_reward = 0
                    cur_state = self.prepropcess_state(self.env.reset())

                # Redraw only right after a training step, so the dashboard is not rebuilt
                # on every step while the counter sits on a multiple of 1000.
                if trained_this_step and self.learning_steps % 1000 == 0 and self.learning_steps > 0:
                    self.plot_data()
        finally:
            # Close the environment even if training is interrupted.
            self.env.close()
       


In [0]:
# Download a previously saved replay buffer from Google Drive and fit a
# StandardScaler on the states it contains.
# NOTE(review): the `scaler` fitted here is not passed to the Agent created in the
# next cell (it is constructed with scaler=None) - confirm whether that is intended.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
        
fname = "buffer.txt"
file_obj = drive.CreateFile({'id': '14BR00UGXlwI7481G8jG6wsssihKYAy0t'})
file_obj.GetContentFile(fname)
# SECURITY: pickle.load can execute arbitrary code; only load files you created yourself.
with open (fname, 'rb') as fp:
    replay_buffer =  pickle.load(fp)
    
# Each buffer entry is [cur_state, action, next_state, reward, final]; take the current states.
states = [elem[0] for elem in replay_buffer.buffer]
states = np.array(states)
scaler = preprocessing.StandardScaler().fit(states)
print(scaler.mean_)

In [0]:
# Build the agent: 6 bins per action dimension, mini-batches of 128, 150000 environment
# steps per run() call, and no state scaling (scaler=None).
agent = Agent(env_name='BipedalWalkerHardcore-v2', batch_size=128, discretization_steps=6, learning_rate=0.0001, number_of_steps=150000, scaler=None)

In [11]:
# Restore a saved replay buffer and a saved Keras model from Google Drive into the agent.
agent.read_colab( buffer_id = '1VJGHPpRi6FFHyS1-1-CEr4JAL_-Otz-X', model_id='1a0rQOC6fbO8r4rH6vc7QeIRgI5U44u1i')

W0418 08:45:28.204307 140535876597632 __init__.py:44] file_cache is unavailable when using oauth2client >= 4.0.0
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/googleapiclient/discovery_cache/__init__.py", line 36, in autodetect
    from google.appengine.api import memcache
ModuleNotFoundError: No module named 'google.appengine'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/googleapiclient/discovery_cache/file_cache.py", line 33, in <module>
    from oauth2client.contrib.locked_file import LockedFile
ModuleNotFoundError: No module named 'oauth2client.contrib.locked_file'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/googleapiclient/discovery_cache/file_cache.py", line 37, in <module>
    from oauth2client.locked_file import LockedFile
ModuleNo

In [0]:
# Train for number_of_steps environment steps, then upload the buffer and model to Drive.
# NOTE(review): run() defaults to continue_learning=False, which replaces the agent's
# Q-network with a newly built one and resets epsilon, so the model loaded by
# read_colab is not the one being trained here; the loaded replay buffer is kept.
# Pass continue_learning=True to keep training the loaded model.
agent.run()
agent.write_colab()

In [0]:
# Continue training the current network with epsilon set to epsilon_min, then upload
# the buffer and model again.
agent.run(continue_learning = True, set_epsilon_to_min = True)
agent.write_colab()

In [0]:
# Another continuation round: same network and statistics, epsilon set to epsilon_min.
agent.run(continue_learning = True, set_epsilon_to_min = True)
agent.write_colab()

In [0]:
# Another continuation round followed by an upload of the buffer and model.
agent.run(continue_learning = True, set_epsilon_to_min = True)
agent.write_colab()

In [0]:
# Another continuation round followed by an upload of the buffer and model.
agent.run(continue_learning = True, set_epsilon_to_min = True)
agent.write_colab()

In [0]:
# Final continuation round in this notebook, followed by an upload of the buffer and model.
agent.run(continue_learning = True, set_epsilon_to_min = True)
agent.write_colab()