In [6]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.models import load_model
from collections import deque
import numpy as np
from tqdm import tqdm
import random
import os
import matplotlib.pyplot as plt
# Bare `%matplotlib` enables an interactive GUI backend for figures; the
# recorded output of this cell reports that Qt5Agg was selected.
%matplotlib
# Turn off TF2 eager execution for the whole session.
# NOTE(review): presumably done to cut the per-call overhead of the many small
# predict()/fit() calls made during training -- confirm.
tf.compat.v1.disable_eager_execution()

Using matplotlib backend: Qt5Agg


In [7]:
# Core hyperparameters
AGENT_MEMORY = 50000            # max transitions kept in the agent's replay deque
UPDATE_AFTER_EPISODES = 5       # episode interval for copying online weights into the target network
HIDDEN_LAYERS = [64, 128]       # units of each hidden Dense layer, in order
BATCH_SIZE = 64                 # transitions sampled per training step
DISCOUNT = 0.997                # weight of the bootstrapped future Q-value
LEARNING_RATE = 0.001
LOSS_FUNCTION = 'mse'
AVG_OF_LAST = 100               # number of episodes per reported average score
TASK = 3                        # selects task{TASK}.py and tags saved model files

# Secondary parameters
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
OPTIMIZER = Adam(learning_rate=LEARNING_RATE)
RENDER_AFTER_EPISODES = 25


def q_new(q_max, reward):
    """Return the Q-learning target for a non-terminal transition.

    Args:
        q_max: highest predicted Q-value of the next state.
        reward: reward received for the transition.

    Returns:
        ``reward + DISCOUNT * q_max``.
    """
    return reward + DISCOUNT * q_max


TOTAL_EPISODES = 1000
MAX_EPSILON = 1                 # initial exploration probability
EPSILON_DECAY = 0.9975          # multiplicative decay applied to epsilon
MIN_EPSILON = 0.0005            # floor for epsilon
DROP_PRECENT = 0.25             # dropout rate used after every hidden layer
PLAY = True                     # run the evaluation cell
PLAY_EPISODES = 100
SHOW = False                    # render the environment during evaluation
PLOT = True                     # live-plot episode scores during training

# Kernel parameters
SAVE_AT_AVG = 400               # minimum average score required to save the model
ONLINE = False                  # True when running on Google Colab
TPU = True
if ONLINE:
    from google.colab import drive
    drive.mount("/content/drive")

# User settings
# Plain paths: the copy below no longer goes through a shell, so the space in
# "My Drive" needs no backslash escaping.
ONLINE_PATH1 = f"/content/drive/My Drive/task{TASK}.py"
ONLINE_PATH2 = "/usr/local/lib/python3.6/dist-packages/gym/envs/classic_control/cartpole.py"
ONLINE_SAVE_PATH = "/content/drive/My Drive"
OFFLINE_PATH1 = f"/home/anany/ML_project/task{TASK}.py"
OFFLINE_PATH2 = "/home/anany/anaconda3/envs/ML/lib/python3.6/site-packages/gym/envs/classic_control/cartpole.py"
OFFLINE_SAVE_PATH = "/home/anany/ML_project"
PATH1 = ONLINE_PATH1 if ONLINE else OFFLINE_PATH1
PATH2 = ONLINE_PATH2 if ONLINE else OFFLINE_PATH2
SAVE = ONLINE_SAVE_PATH if ONLINE else OFFLINE_SAVE_PATH

# Overwrite gym's cartpole.py with the task-specific environment file.
# shutil.copyfile raises if the copy fails, whereas the exit status of a shell
# `cp` was silently ignored and a stale environment file could go unnoticed.
import shutil
shutil.copyfile(PATH1, PATH2)
with open(PATH2, 'r') as f:
    # The first line of the task file identifies which task is installed.
    print(f"Task file --> {f.readline()}")

Task file --> #TASK 3



In [8]:
# Attach to the Colab TPU runtime; attempted only when both flags are enabled.
if ONLINE and TPU:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu=f"grpc://{os.environ['COLAB_TPU_ADDR']}"
    )
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    # NOTE(review): no other visible cell references `strategy` (e.g. via
    # strategy.scope()), so the models may not actually be placed on the TPU.
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

In [9]:
class Agent:
    """Q-learning agent with an online network and a separate target network.

    The class reads module-level names defined in other cells: the gym
    environment ``env`` and the configuration constants (``HIDDEN_LAYERS``,
    ``BATCH_SIZE``, ``MAX_EPSILON``, ...). ``env`` must exist before an
    instance is created.

    Attributes:
        present_q_model: network that is trained and used to choose actions.
        future_q_model: target network used to evaluate next states.
        memory: bounded deque of ``[action, state, reward, new_state, done]``.
        epsilon: current probability of taking a random action.
        episodes: number of completed episodes.
        all_scores: per-episode scores (starts with a leading 0).
        avg_score: mean score over the last completed AVG_OF_LAST-episode window.
    """

    def __init__(self):
        self.state = env.reset().tolist()
        self.done = False
        self.total_score = 0
        self.score_per_episode = 0
        self.all_scores = [0]
        self.render = True
        self.epsilon = MAX_EPSILON
        self.avg_score = 0

        # Both networks share one architecture; the target starts as an exact
        # copy of the online network.
        self.present_q_model = self._build_q_network()
        self.future_q_model = self._build_q_network()
        self._sync_target_network()

        self.memory = deque(maxlen=AGENT_MEMORY)
        self.episodes = 0
        # summary() prints by itself and returns None, so it is not wrapped in print().
        self.present_q_model.summary()
        self.future_q_model.summary()

    @staticmethod
    def _build_q_network():
        """Build and compile one Q-network sized for the global ``env``."""
        n_inputs = env.observation_space.low.size
        model = Sequential()
        model.add(Dense(HIDDEN_LAYERS[0], input_shape=(n_inputs,), activation='relu'))
        model.add(Dropout(DROP_PRECENT))
        for units in HIDDEN_LAYERS[1:]:
            model.add(Dense(units, activation='relu'))
            model.add(Dropout(DROP_PRECENT))
        # One linear output per discrete action.
        model.add(Dense(env.action_space.n, activation='linear'))
        # No 'accuracy' metric: the network regresses Q-values, so classification
        # accuracy carries no meaning and nothing reads it.
        model.compile(loss=LOSS_FUNCTION, optimizer=OPTIMIZER)
        return model

    def _sync_target_network(self):
        """Copy the online network's weights into the target network."""
        self.future_q_model.set_weights(self.present_q_model.get_weights())

    def train(self):
        """Fit the online network on one random minibatch from memory.

        Does nothing until at least BATCH_SIZE transitions are stored.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        batch = random.sample(self.memory, BATCH_SIZE)
        states = np.array([slot[1] for slot in batch])
        next_states = np.array([slot[3] for slot in batch])

        # Start from the current predictions so only the taken action's value
        # contributes to the loss.
        targets = self.present_q_model.predict(states)
        # A Double-DQN variant would select the argmax action with the online
        # network and evaluate it with the target network instead.
        future_q_values = self.future_q_model.predict(next_states)
        for index, (action, _, reward, _, done) in enumerate(batch):
            if done:
                # Terminal transition: there is no future value to bootstrap from.
                targets[index][action] = reward
            else:
                targets[index][action] = q_new(np.max(future_q_values[index]), reward)

        self.present_q_model.fit(states, targets, batch_size=BATCH_SIZE,
                                 shuffle=False, verbose=0)

    def next_action(self):
        """Take one epsilon-greedy step in ``env``, store it, and train.

        Returns:
            The ``done`` flag returned by ``env.step`` for this step.
        """
        if np.random.random() > self.epsilon:
            # The network is queried only when the greedy action is needed.
            q_values = self.present_q_model.predict(np.array([self.state]))
            action = np.argmax(q_values)
        else:
            action = np.random.randint(0, env.action_space.n)

        new_state, reward, done, info = env.step(action)
        self.total_score += reward
        self.score_per_episode += reward

        if done:
            self.all_scores.append(self.score_per_episode)
            self.score_per_episode = 0
            if PLOT:
                # Clear first: otherwise every episode stacks another copy of the
                # whole score history onto the axes.
                plt.cla()
                plt.plot(self.all_scores, 'g-')
                plt.pause(0.05)
            self.episodes += 1
            self.render = self.episodes % RENDER_AFTER_EPISODES == 0

            # Sync exactly once when the episode counter reaches a multiple of
            # UPDATE_AFTER_EPISODES. Checking this on every training step would
            # re-copy the weights throughout such an episode and leave the
            # target network unfrozen.
            if self.episodes % UPDATE_AFTER_EPISODES == 0:
                self._sync_target_network()

            if self.episodes % AVG_OF_LAST == 0:
                self.avg_score = self.total_score/AVG_OF_LAST
                print(f"Average Score = {self.avg_score}")
                self.total_score = 0

        # Rendering is disabled; self.render is still maintained so an
        # env.render() call can be gated on it.

        self.memory.append([action, self.state, reward, new_state, done])
        self.state = new_state.tolist() if not done else env.reset().tolist()

        self.train()
        if self.epsilon > MIN_EPSILON:
            self.epsilon = max(MIN_EPSILON, self.epsilon * EPSILON_DECAY)

        return done
        

In [None]:
import gym

env = gym.make('CartPole-v1')
agent = Agent()
for episode in tqdm(range(TOTAL_EPISODES)):
    # Step until the environment reports the end of the episode.
    while not agent.next_action():
        pass

    # agent.avg_score is refreshed only every AVG_OF_LAST episodes. Saving is
    # limited to that episode so the checkpoint holds the weights the score in
    # its filename was measured with, instead of being overwritten after every
    # later episode under the same stale label.
    if agent.episodes % AVG_OF_LAST == 0 and agent.avg_score >= SAVE_AT_AVG:
        agent.present_q_model.save(f"{SAVE}/model-{TASK}-{agent.avg_score}.h5")

CartPoleEnv - Version 0.2.0, Noise case: 1



  0%|          | 0/1000 [00:00<?, ?it/s][A

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 64)                320       
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 258       
Total params: 8,898
Trainable params: 8,898
Non-trainable params: 0
_________________________________________________________________
None
Model: "sequential_3"
_________________________________________________________________
Layer (type)             


  0%|          | 1/1000 [00:00<04:52,  3.41it/s][A
  0%|          | 2/1000 [00:01<09:09,  1.82it/s][A
  0%|          | 3/1000 [00:01<08:24,  1.98it/s][A
  0%|          | 4/1000 [00:02<08:56,  1.86it/s][A
  0%|          | 5/1000 [00:03<09:33,  1.73it/s][A
  1%|          | 6/1000 [00:03<09:48,  1.69it/s][A
  1%|          | 7/1000 [00:04<09:14,  1.79it/s][A
  1%|          | 8/1000 [00:04<08:37,  1.92it/s][A
  1%|          | 9/1000 [00:05<08:16,  1.99it/s][A
  1%|          | 10/1000 [00:05<07:47,  2.12it/s][A
  1%|          | 11/1000 [00:06<08:02,  2.05it/s][A
  1%|          | 12/1000 [00:06<07:09,  2.30it/s][A
  1%|▏         | 13/1000 [00:06<07:47,  2.11it/s][A
  1%|▏         | 14/1000 [00:07<08:30,  1.93it/s][A
  2%|▏         | 15/1000 [00:07<08:00,  2.05it/s][A
  2%|▏         | 16/1000 [00:08<09:07,  1.80it/s][A
  2%|▏         | 17/1000 [00:09<09:01,  1.82it/s][A
  2%|▏         | 18/1000 [00:09<08:06,  2.02it/s][A
  2%|▏         | 19/1000 [00:09<07:19,  2.23it/s][A
 

Average Score = 55.89



 10%|█         | 101/1000 [03:02<57:35,  3.84s/it][A
 10%|█         | 102/1000 [03:06<55:12,  3.69s/it][A
 10%|█         | 103/1000 [03:10<57:16,  3.83s/it][A
 10%|█         | 104/1000 [03:17<1:10:50,  4.74s/it][A
 10%|█         | 105/1000 [03:19<1:01:50,  4.15s/it][A
 11%|█         | 106/1000 [03:26<1:12:56,  4.90s/it][A
 11%|█         | 107/1000 [03:28<59:29,  4.00s/it]  [A
 11%|█         | 108/1000 [03:31<55:10,  3.71s/it][A
 11%|█         | 109/1000 [03:34<53:51,  3.63s/it][A
 11%|█         | 110/1000 [03:38<55:05,  3.71s/it][A
 11%|█         | 111/1000 [03:46<1:10:21,  4.75s/it][A
 11%|█         | 112/1000 [03:52<1:19:40,  5.38s/it][A
 11%|█▏        | 113/1000 [03:58<1:20:36,  5.45s/it][A
 11%|█▏        | 114/1000 [04:02<1:12:45,  4.93s/it][A
 12%|█▏        | 115/1000 [04:06<1:10:27,  4.78s/it][A
 12%|█▏        | 116/1000 [04:12<1:13:06,  4.96s/it][A
 12%|█▏        | 117/1000 [04:18<1:17:53,  5.29s/it][A
 12%|█▏        | 118/1000 [04:21<1:09:45,  4.75s/it][A
 12

Average Score = 179.15



 20%|██        | 201/1000 [12:33<56:21,  4.23s/it][A
 20%|██        | 202/1000 [12:35<46:59,  3.53s/it][A
 20%|██        | 203/1000 [12:41<56:15,  4.24s/it][A
 20%|██        | 204/1000 [12:43<50:18,  3.79s/it][A
 20%|██        | 205/1000 [12:51<1:05:14,  4.92s/it][A
 21%|██        | 206/1000 [12:55<1:00:24,  4.56s/it][A
 21%|██        | 207/1000 [13:01<1:04:55,  4.91s/it][A
 21%|██        | 208/1000 [13:19<1:59:34,  9.06s/it][A
 21%|██        | 209/1000 [13:24<1:42:54,  7.81s/it][A
 21%|██        | 210/1000 [13:29<1:32:26,  7.02s/it][A
 21%|██        | 211/1000 [13:39<1:41:58,  7.75s/it][A
 21%|██        | 212/1000 [13:44<1:33:06,  7.09s/it][A
 21%|██▏       | 213/1000 [13:48<1:17:45,  5.93s/it][A
 21%|██▏       | 214/1000 [13:55<1:24:49,  6.48s/it][A
 22%|██▏       | 215/1000 [14:11<2:00:23,  9.20s/it][A
 22%|██▏       | 216/1000 [14:17<1:50:06,  8.43s/it][A
 22%|██▏       | 217/1000 [14:22<1:33:28,  7.16s/it][A
 22%|██▏       | 218/1000 [14:25<1:18:33,  6.03s/it][A

In [None]:
# Run the first two code cells (imports and configuration) before this cell.
# Loads a saved model and reports its mean score over PLAY_EPISODES greedy episodes.
if PLAY:
    import gym
    env = gym.make('CartPole-v1')
    score = input("Enter the score of the model you want to load ")
    trained_agent = load_model(f"{SAVE}/model-{TASK}-{score}.h5")
    total_reward = 0
    for _ in tqdm(range(PLAY_EPISODES)):
        # Keep the observation returned by reset(); discarding it would make the
        # first action of each episode depend on the previous episode's final state.
        state = env.reset()
        done = False
        while not done:
            action = np.argmax(trained_agent.predict(np.array([state])))
            state, reward, done, info = env.step(action)
            total_reward += reward
            if SHOW:
                env.render()
    env.close()
    avg_score = total_reward/PLAY_EPISODES
    print(avg_score)