In [1]:
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.models import load_model
from collections import deque
import numpy as np
from tqdm import tqdm
import random
import os
import matplotlib.pyplot as plt
%matplotlib
tf.compat.v1.disable_eager_execution()

1.15.2
Using matplotlib backend: Qt5Agg


In [2]:
#Important Parameters
AGENT_MEMORY = 25000
UPDATE_AFTER_EPISODES = 5
HIDDEN_LAYERS= [64, 128]
BATCH_SIZE = 64
DISCOUNT = 0.997
LEARNING_RATE = 0.001
LOSS_FUNCTION = 'mse'
AVG_OF_LAST = 100
TASK = 0

#Unimportant Parameters
OPTIMIZER = Adam(lr=LEARNING_RATE)
RENDER_AFTER_EPISODES = 25
q_new = lambda q_max, reward: (reward + DISCOUNT * q_max)
TOTAL_EPISODES = 1000
MAX_EPSILON = 1
EPSILON_DECAY = 0.9975
MIN_EPSILON = 0.0005
DROP_PRECENT = 0.25
PLAY = True
PLAY_EPISODES = 100
SHOW = False
PLOT = True

#Kernel Parameters
SAVE_AT_AVG = 400
ONLINE = False
TPU = True
if ONLINE:
    from google.colab import drive
    drive.mount("/content/drive")

#User Settings
ONLINE_PATH1 = f"/content/drive/My\ Drive/task{TASK}.py"
ONLINE_PATH2 = "/usr/local/lib/python3.6/dist-packages/gym/envs/classic_control/cartpole.py"
ONLINE_SAVE_PATH = "/content/drive/My Drive"
OFFLINE_PATH1 = f"/home/anany/ML_project/task{TASK}.py"
OFFLINE_PATH2 = "/home/anany/anaconda3/envs/ML/lib/python3.6/site-packages/gym/envs/classic_control/cartpole.py"
OFFLINE_SAVE_PATH = "/home/anany/ML_project"
PATH1 = ONLINE_PATH1 if ONLINE else OFFLINE_PATH1
PATH2 = ONLINE_PATH2 if ONLINE else OFFLINE_PATH2
SAVE = ONLINE_SAVE_PATH if ONLINE else OFFLINE_SAVE_PATH
os.system('cp '+ PATH1 + ' ' + PATH2)
with open(PATH2, 'r') as f:
    print(f"Task file --> {f.readline()}")

Task file --> #TASK 0



In [None]:
if TPU and ONLINE:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

In [None]:
class Agent:

    def __init__(self):
        self.state = env.reset().tolist()
        self.done = False
        self.total_score = 0
        self.score_per_episode = 0
        self.all_scores = [0]
        self.render = True
        self.epsilon = MAX_EPSILON
        self.avg_score = 0
        self.present_q_model = Sequential()
        self.present_q_model.add(Dense(HIDDEN_LAYERS[0],
                                       input_shape=(env.observation_space.low.size,),
                                       activation='relu'))
        self.present_q_model.add(Dropout(DROP_PRECENT))
        for index in range(1, len(HIDDEN_LAYERS)):
            self.present_q_model.add(Dense(HIDDEN_LAYERS[index],activation='relu'))
            self.present_q_model.add(Dropout(DROP_PRECENT))
        self.present_q_model.add(Dense(env.action_space.n,activation='linear'))
        self.present_q_model.compile(loss=LOSS_FUNCTION, optimizer=OPTIMIZER,
                                     metrics=['accuracy'])
        
        self.future_q_model = Sequential()
        self.future_q_model.add(Dense(HIDDEN_LAYERS[0],
                                      input_shape=(env.observation_space.low.size,),
                                      activation='relu'))
        self.future_q_model.add(Dropout(DROP_PRECENT))
        for index in range(1, len(HIDDEN_LAYERS)):
            self.future_q_model.add(Dense(HIDDEN_LAYERS[index],activation='relu'))
            self.future_q_model.add(Dropout(DROP_PRECENT))
        self.future_q_model.add(Dense(env.action_space.n,activation='linear'))
        self.future_q_model.compile(loss=LOSS_FUNCTION, optimizer=OPTIMIZER, 
                                    metrics=['accuracy'])

        self.future_q_model.set_weights(self.present_q_model.get_weights())
        
        self.memory = deque(maxlen=AGENT_MEMORY)
        self.episodes = 0
        print(self.present_q_model.summary())
        print(self.future_q_model.summary())
        
    def train(self):
        if len(self.memory) < BATCH_SIZE :
            return
        
        batch = random.sample(self.memory, BATCH_SIZE)
        present_q_values = self.present_q_model.predict(np.array([x[1] for x in batch]))
        #future_q2_values = self.present_q_model.predict(np.array([x[3] for x in batch]))
        future_q_values = self.future_q_model.predict(np.array([x[3] for x in batch]))
        for index, slot in enumerate(batch):
            if not slot[4]:
                #action_pred = np.argmax(future_q2_values[index])
                #q_future_max = future_q_values[index][action_pred]
                q_future_max = np.max(future_q_values[index])
                qnew = q_new(q_future_max, slot[2])
            else:
                qnew = slot[2]
            
            present_q_values[index][slot[0]] = qnew
            
        X = np.array([slot[1] for slot in batch])
        Y = present_q_values
        
        history = self.present_q_model.fit(X, Y, batch_size=BATCH_SIZE,
                                           shuffle=False, verbose = 0)
        
        if self.episodes % UPDATE_AFTER_EPISODES == 0:
            self.future_q_model.set_weights(self.present_q_model.get_weights())
                
    def next_action(self):
        q_values = self.present_q_model.predict(np.array([self.state]))
        
        if np.random.random() > self.epsilon:
            action = np.argmax(q_values)
        else:
            action = np.random.randint(0, env.action_space.n)
            
        new_state, reward, done, info = env.step(action)
        self.total_score += reward
        self.score_per_episode += reward
        
        if done:
            self.all_scores.append(self.score_per_episode)
            self.score_per_episode = 0
            if PLOT:
                plt.plot(self.all_scores,'g-')
                plt.pause(0.05)
            self.episodes +=1
            if self.episodes % RENDER_AFTER_EPISODES == 0:
                self.render = True
            else:
                self.render = False
                
            if self.episodes % AVG_OF_LAST == 0:
                self.avg_score = self.total_score/AVG_OF_LAST
                print(f"Average Score = {self.avg_score}")
                self.total_score = 0
                        
        if self.render:
            #env.render()
            None
            
            
        self.memory.append([action, self.state, reward, new_state, done])
        self.state = new_state.tolist() if not done else env.reset().tolist()
        
        self.train()
        if self.epsilon > MIN_EPSILON:
            self.epsilon *= EPSILON_DECAY
            self.epsilon = max(MIN_EPSILON, self.epsilon)
        
        return done
        

In [None]:
import gym
env = gym.make('CartPole-v1')
agent = Agent()
for episode in tqdm(range(TOTAL_EPISODES)):
    while True:
        stop = agent.next_action()
        if stop:
            if agent.avg_score >= SAVE_AT_AVG:
                agent.present_q_model.save(f"{SAVE}/model-{TASK}-{agent.avg_score}.h5")
            break

avg_val = 0
for i in range(len(agent.all_scores)):
    avg_val = max(sum(agent.all_scores[i:i+AVG_OF_LAST])/AVG_OF_LAST, avg_val)
    
print(avg_val)

In [2]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.models import load_model
from collections import deque
import numpy as np
from tqdm import tqdm
import random
import os
import matplotlib.pyplot as plt
%matplotlib
tf.compat.v1.disable_eager_execution()
#Important Parameters
AGENT_MEMORY = 20000
UPDATE_AFTER_EPISODES = 5
HIDDEN_LAYERS= [64, 128]
BATCH_SIZE = 32
DISCOUNT = 0.997
LEARNING_RATE = 0.001
LOSS_FUNCTION = 'mse'
AVG_OF_LAST = 100
TASK = 1

#Unimportant Parameters
OPTIMIZER = Adam(lr=LEARNING_RATE)
RENDER_AFTER_EPISODES = 25
q_new = lambda q_max, reward: (reward + DISCOUNT * q_max)
TOTAL_EPISODES = 1000
MAX_EPSILON = 1
EPSILON_DECAY = 0.9975
MIN_EPSILON = 0.0005
DROP_PRECENT = 0.25
PLAY = True
PLAY_EPISODES = 100
SHOW = False
PLOT = True
TRAIN = True

#Kernel Parameters
SAVE_AT_AVG = 400
ONLINE = False
TPU = True
if ONLINE:
    from google.colab import drive
    drive.mount("/content/drive")

#User Settings
ONLINE_PATH1 = f"/content/drive/My\ Drive/task{TASK}.py"
ONLINE_PATH2 = "/usr/local/lib/python3.6/dist-packages/gym/envs/classic_control/cartpole.py"
ONLINE_SAVE_PATH = "/content/drive/My Drive"
OFFLINE_PATH1 = f"/home/anany/ML_project/task{TASK}.py"
OFFLINE_PATH2 = "/home/anany/anaconda3/envs/ML/lib/python3.6/site-packages/gym/envs/classic_control/cartpole.py"
OFFLINE_SAVE_PATH = "/home/anany/ML_project"
PATH1 = ONLINE_PATH1 if ONLINE else OFFLINE_PATH1
PATH2 = ONLINE_PATH2 if ONLINE else OFFLINE_PATH2
SAVE = ONLINE_SAVE_PATH if ONLINE else OFFLINE_SAVE_PATH
os.system('cp '+ PATH1 + ' ' + PATH2)
with open(PATH2, 'r') as f:
    print(f"Task file --> {f.readline()}")

def plot(scores=[]):
    if PLOT:
        plt.clf()
        avg_scores = [sum(scores[:index+1])/(index+1) for index, 
                      score in enumerate(scores)]
        avg_score = sum(scores)/len(scores)
        x_val = [0,len(scores)-1]
        y_val = [avg_score, avg_score]
        plt.plot(scores,'g-', label='current score')
        plt.plot(x_val, y_val,'r-', label='average score')
        plt.plot(avg_scores,'b-', label='average scores')
        if len(scores) >= AVG_OF_LAST:
            avg_of_last = sum(scores[-1*AVG_OF_LAST:])/AVG_OF_LAST
            y_val = [avg_of_last, avg_of_last]
            plt.plot(x_val, y_val, 'k-', label=f'average scores of last {AVG_OF_LAST}')
        plt.xlabel('Epochs--->')
        plt.ylabel('Score--->')
        plt.legend()
        plt.pause(0.05)
    
#PLEASE RUN THE FIRST TWO CODE BLOCKS BEFORE RUNNING THIS BLOCK
if PLAY:
    import gym
    env = gym.make('CartPole-v1')
    score = input("Enter the score of the model you want to load ")
    trained_agent = load_model(f"{SAVE}/model-{TASK}-{score}.h5")
    state = env.reset()
    avg_score = 0
    for _ in tqdm(range(PLAY_EPISODES)):
        env.reset()
        while True:
            action = np.argmax(trained_agent.predict([state.tolist()]))
            state, reward, done, info = env.step(action)
            avg_score += reward
            if SHOW:
                env.render()
            if done:
                break
    print(avg_score/PLAY_EPISODES)



Using matplotlib backend: Qt5Agg
Task file --> #TASK 1

CartPoleEnv - Version 0.2.0, Noise case: 1
Enter the score of the model you want to load 500.0


OSError: SavedModel file does not exist at: /home/anany/ML_project/model-1-500.0.h5/{saved_model.pbtxt|saved_model.pb}

In [2]:
from spinup import td3_tf1 as ppo
import tensorflow as tf
import gym

env_fn = lambda : gym.make('CartPole-v1')

ac_kwargs = dict(hidden_sizes=[64,64], activation=tf.nn.relu)

logger_kwargs = dict(output_dir='logs_trash', exp_name='experiment_name')

ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=5000, epochs=250, logger_kwargs=logger_kwargs)

[32;1mLogging data to logs_trash/progress.txt[0m
[36;1mSaving config:
[0m
{
    "ac_kwargs":	{
        "activation":	"relu",
        "hidden_sizes":	[
            64,
            64
        ]
    },
    "act_noise":	0.1,
    "actor_critic":	"mlp_actor_critic",
    "batch_size":	100,
    "env_fn":	"<function <lambda> at 0x7fb320c25950>",
    "epochs":	250,
    "exp_name":	"experiment_name",
    "gamma":	0.99,
    "logger":	{
        "<spinup.utils.logx.EpochLogger object at 0x7fb3203bf7b8>":	{
            "epoch_dict":	{},
            "exp_name":	"experiment_name",
            "first_row":	true,
            "log_current_row":	{},
            "log_headers":	[],
            "output_dir":	"logs_trash",
            "output_file":	{
                "<_io.TextIOWrapper name='logs_trash/progress.txt' mode='w' encoding='UTF-8'>":	{
                    "mode":	"w"
                }
            }
        }
    },
    "logger_kwargs":	{
        "exp_name":	"experiment_name",
        "output_di

AttributeError: 'Discrete' object has no attribute 'high'

In [5]:
gym.make('CartPole-v1').action_space.

Discrete(2)