In [10]:
import os
import random
import shutil
from collections import deque

import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop
from tqdm import tqdm

In [11]:
# --- Core DQN hyperparameters ---------------------------------------------
AGENT_MEMORY = 50000          # max number of transitions kept in the replay deque
UPDATE_AFTER_EPISODES = 5     # episode period used for syncing the target network
HIDDEN_LAYERS = [128, 64]     # units of each hidden Dense layer, in order
BATCH_SIZE = 32               # transitions sampled from memory per training step
DISCOUNT = 0.99               # weight applied to the bootstrapped future Q-value
LEARNING_RATE = 0.005
LOSS_FUNCTION = 'mse'
AVG_OF_LAST = 100             # episodes per "Average Score" report
TASK = 1                      # selects task{TASK}.py and tags saved model files

# --- Secondary parameters ---------------------------------------------------
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases reject.
OPTIMIZER = Adam(learning_rate=LEARNING_RATE)
RENDER_AFTER_EPISODES = 25    # episode period at which Agent.render is set to True


def q_new(q_max, reward):
    """Return the bootstrapped Q-learning target ``reward + DISCOUNT * q_max``."""
    return reward + DISCOUNT * q_max


TOTAL_EPISODES = 1000
MAX_EPSILON = 1               # initial exploration probability
EPSILON_DECAY = 0.975         # multiplicative decay applied to epsilon
MIN_EPSILON = 0.001           # floor for epsilon
DROP_PRECENT = 0.1            # dropout rate (spelling kept: the Agent class uses this name)

# --- Kernel parameters ------------------------------------------------------
SAVE_AT_AVG = 400             # save the model once the reported average reaches this
ONLINE = False                # True selects the Colab paths and mounts Google Drive
TPU = True                    # TPU setup only runs when ONLINE is also True

In [12]:
# Colab-only: connect to the TPU advertised in COLAB_TPU_ADDR and build a
# distribution strategy for it.
if ONLINE and TPU:
    tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    # NOTE(review): `strategy` is not referenced anywhere else in the visible
    # notebook (no `strategy.scope()`), so the models may not actually be
    # placed on the TPU -- confirm.
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

In [13]:
# Colab-only: mount Google Drive under /content/drive so the paths used by
# the ONLINE branches elsewhere in the notebook resolve.
if ONLINE:
    from google.colab import drive

    drive.mount("/content/drive")

In [14]:
# Pick the task-specific CartPole source (PATH1) and the cartpole.py inside the
# installed gym package (PATH2), then overwrite the latter with the former.
# NOTE(review): these are hardcoded absolute paths tied to one machine / one
# Python version; consider deriving PATH2 from the installed gym package.
if ONLINE:
    # No shell is involved any more, so the space in "My Drive" must not be
    # backslash-escaped (the old "\ " was also an invalid escape sequence).
    PATH1 = f"/content/drive/My Drive/task{TASK}.py"
    PATH2 = "/usr/local/lib/python3.6/dist-packages/gym/envs/classic_control/cartpole.py"
else:
    PATH1 = f"/home/anany/ML_project/task{TASK}.py"
    PATH2 = "/home/anany/anaconda3/envs/ML/lib/python3.6/site-packages/gym/envs/classic_control/cartpole.py"

# shutil.copy raises if the copy fails, whereas os.system('cp ...') only
# returned an exit status that was ignored.
shutil.copy(PATH1, PATH2)

# Print the first line of the installed file as a quick sanity check.
with open(PATH2, 'r') as f:
    print(f.readline())

#TASK 1



In [6]:
class Agent:
    """Deep Q-learning agent with an online network and a target network.

    The class relies on names defined at notebook level: the gym environment
    ``env`` and the hyperparameter constants (``HIDDEN_LAYERS``,
    ``BATCH_SIZE``, ``OPTIMIZER``, ``q_new`` ...). ``env`` must exist before
    an instance is created.

    Attributes:
        state: current observation, stored as a plain list.
        score: number of steps taken since the last average-score report.
        render: flag set to True every ``RENDER_AFTER_EPISODES`` episodes.
        epsilon: current probability of taking a random action.
        avg_reward: last reported average score over ``AVG_OF_LAST`` episodes.
        present_q_model: online network, trained on every step.
        future_q_model: target network used to bootstrap future Q-values.
        memory: replay buffer of ``[action, state, reward, next_state, done]``.
        episodes: number of finished episodes.
    """

    def __init__(self):
        self.state = env.reset().tolist()
        self.done = False
        self.score = 0
        self.render = True
        self.epsilon = MAX_EPSILON
        self.avg_reward = 0

        self.present_q_model = self._build_q_model()
        self.future_q_model = self._build_q_model()
        # Both networks start from identical weights.
        self._sync_target_model()

        self.memory = deque(maxlen=AGENT_MEMORY)
        self.episodes = 0
        # summary() prints by itself and returns None, so it is not wrapped
        # in print() (that only added a stray "None" line).
        self.present_q_model.summary()
        self.future_q_model.summary()

    @staticmethod
    def _build_q_model():
        """Build and compile one Q-network: Dense/Dropout stack with a linear head."""
        model = Sequential()
        model.add(Dense(HIDDEN_LAYERS[0],
                        input_shape=(env.observation_space.low.size,),
                        activation='relu'))
        model.add(Dropout(DROP_PRECENT))
        for units in HIDDEN_LAYERS[1:]:
            model.add(Dense(units, activation='relu'))
            model.add(Dropout(DROP_PRECENT))
        model.add(Dense(env.action_space.n, activation='linear'))
        # No 'accuracy' metric: it is meaningless for an MSE regression on
        # Q-values and only adds per-step overhead.
        model.compile(loss=LOSS_FUNCTION, optimizer=OPTIMIZER)
        return model

    def _sync_target_model(self):
        """Copy the online network's weights into the target network."""
        self.future_q_model.set_weights(self.present_q_model.get_weights())

    def train(self):
        """Run one gradient step of the online network on a random replay batch.

        Does nothing until the memory holds at least ``BATCH_SIZE``
        transitions. Syncing the target network is not done here; it happens
        once per finished episode in ``next_action``.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        batch = random.sample(self.memory, BATCH_SIZE)
        # Explicit arrays: a nested Python list can be mistaken by Keras for
        # a list of separate model inputs.
        states = np.asarray([slot[1] for slot in batch], dtype=np.float32)
        next_states = np.asarray([slot[3] for slot in batch], dtype=np.float32)

        # np.array(...) makes a writable copy that is edited into the targets.
        targets = np.array(self.present_q_model.predict_on_batch(states),
                           dtype=np.float32)
        future_q_values = np.asarray(
            self.future_q_model.predict_on_batch(next_states))

        for index, (action, _, reward, _, done) in enumerate(batch):
            if done:
                # Terminal transition: nothing to bootstrap from.
                targets[index][action] = reward
            else:
                targets[index][action] = q_new(np.max(future_q_values[index]),
                                               reward)

        self.present_q_model.fit(states, targets, batch_size=BATCH_SIZE,
                                 shuffle=False, verbose=0)

    def next_action(self):
        """Take one epsilon-greedy environment step, store it, and train.

        Returns:
            The ``done`` flag returned by ``env.step`` for this step.
        """
        if np.random.random() > self.epsilon:
            # Only query the network when its answer is actually used.
            state_batch = np.asarray([self.state], dtype=np.float32)
            q_values = np.asarray(
                self.present_q_model.predict_on_batch(state_batch))
            action = int(np.argmax(q_values))
        else:
            action = np.random.randint(0, env.action_space.n)

        new_state, reward, done, _ = env.step(action)
        self.score += 1

        if done:
            self.episodes += 1
            self.render = self.episodes % RENDER_AFTER_EPISODES == 0

            if self.episodes % AVG_OF_LAST == 0:
                self.avg_reward = self.score / AVG_OF_LAST
                print(f"Average Score = {self.avg_reward}")
                self.score = 0

        if self.render:
            pass  # rendering hook; the env.render() call is disabled

        self.memory.append([action, self.state, reward, new_state.tolist(), done])
        self.state = new_state.tolist() if not done else env.reset().tolist()

        self.train()

        # Sync once, when an episode that is a multiple of
        # UPDATE_AFTER_EPISODES has just finished. Doing this inside train()
        # re-copied the weights on every step of such episodes (including
        # the whole first episode), defeating the target network.
        if done and self.episodes % UPDATE_AFTER_EPISODES == 0:
            self._sync_target_model()

        # NOTE(review): epsilon decays on every step, not every episode, so it
        # reaches MIN_EPSILON after a few hundred steps -- confirm intended.
        if self.epsilon > MIN_EPSILON:
            self.epsilon *= EPSILON_DECAY
            self.epsilon = max(MIN_EPSILON, self.epsilon)

        return done
        

In [None]:
# NOTE(review): gym is imported in this cell rather than the import cell,
# presumably so it loads after the cartpole.py overwrite above -- confirm
# before moving it.
import gym

env = gym.make('CartPole-v1')
agent = Agent()

for episode in tqdm(range(TOTAL_EPISODES)):
    # Step the agent until the current episode terminates.
    stop = False
    while not stop:
        stop = agent.next_action()

    # Once the reported average reaches the threshold, save the online
    # network at the end of each episode.
    if agent.avg_reward >= SAVE_AT_AVG:
        if ONLINE:
            model_path = f"/content/drive/My Drive/model-{TASK}-{agent.avg_reward}.h5"
        else:
            model_path = f"model-{TASK}-{agent.avg_reward}.h5"
        _ = agent.present_q_model.save(model_path)

CartPoleEnv - Version 0.2.0, Noise case: 1
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               640       
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 130       
Total params: 9,026
Trainable params: 9,026
Non-trainable params: 0
_________________________________________________________________
None
Model: "sequential_1"
__________________________________________________

  0%|          | 0/1000 [00:00<?, ?it/s]

dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 130       
Total params: 9,026
Trainable params: 9,026
Non-trainable params: 0
_________________________________________________________________
None


  6%|▌         | 60/1000 [10:06<12:31:18, 47.96s/it]