In [8]:
import os
import random
import shutil
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam, RMSprop
from tqdm import tqdm
%matplotlib
tf.compat.v1.disable_eager_execution()

Using matplotlib backend: Qt5Agg


In [9]:
#Important Parameters
AGENT_MEMORY = 20000          # maxlen of the agent's replay deque
UPDATE_AFTER_EPISODES = 5     # episode interval used when copying weights into the target network
HIDDEN_LAYERS= [64, 128]      # units of each hidden Dense layer, in order
BATCH_SIZE = 32               # transitions sampled from memory per training step
DISCOUNT = 0.997              # discount factor applied to the bootstrapped future Q-value
LEARNING_RATE = 0.001
LOSS_FUNCTION = 'mse'
AVG_OF_LAST = 100             # window (in episodes) for the "average of last N" statistics
TASK = 1                      # selects task{TASK}.py and is embedded in saved model file names

#Unimportant Parameters
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
OPTIMIZER = Adam(learning_rate=LEARNING_RATE)
RENDER_AFTER_EPISODES = 25    # the agent raises its render flag every this many episodes


def q_new(q_max, reward):
    """Return the Q-learning target: reward plus the discounted best future Q-value."""
    return reward + DISCOUNT * q_max


TOTAL_EPISODES = 1000
MAX_EPSILON = 1               # initial exploration probability
EPSILON_DECAY = 0.9975        # multiplicative decay applied after every agent step
MIN_EPSILON = 0.0005          # floor for the exploration probability
DROP_PRECENT = 0.25           # dropout rate placed after each hidden Dense layer
PLAY = True                   # run the evaluation cell
PLAY_EPISODES = 100
SHOW = False                  # render the environment during evaluation
PLOT = True                   # enable live score plotting
TRAIN = True                  # run the training cell

#Kernel Parameters
SAVE_AT_AVG = 400             # rolling-average score at which avg_model checkpoints are written
ONLINE = False                # True when running on Google Colab
TPU = True
if ONLINE:
    from google.colab import drive
    drive.mount("/content/drive")

#User Settings
# Paths are handed to Python file APIs (not a shell), so spaces need no escaping.
ONLINE_PATH1 = f"/content/drive/My Drive/task{TASK}.py"
ONLINE_PATH2 = "/usr/local/lib/python3.6/dist-packages/gym/envs/classic_control/cartpole.py"
ONLINE_SAVE_PATH = "/content/drive/My Drive"
OFFLINE_PATH1 = f"/home/anany/ML_project/task{TASK}.py"
OFFLINE_PATH2 = "/home/anany/anaconda3/envs/ML/lib/python3.6/site-packages/gym/envs/classic_control/cartpole.py"
OFFLINE_SAVE_PATH = "/home/anany/ML_project"
PATH1 = ONLINE_PATH1 if ONLINE else OFFLINE_PATH1
PATH2 = ONLINE_PATH2 if ONLINE else OFFLINE_PATH2
SAVE = ONLINE_SAVE_PATH if ONLINE else OFFLINE_SAVE_PATH

# Overwrite gym's installed cartpole.py with the task-specific environment file.
# shutil.copyfile raises if the copy fails, instead of silently leaving a stale
# environment in place the way an unchecked shell `cp` would.
shutil.copyfile(PATH1, PATH2)
with open(PATH2, 'r') as f:
    # The first line of the installed file is echoed as a sanity check.
    print(f"Task file --> {f.readline()}")

Task file --> #TASK 1



In [15]:
def plot(scores=None):
    """Redraw the live score chart on the current matplotlib figure.

    Draws the per-episode scores, the overall mean (horizontal line), the
    running mean up to each episode and, once at least AVG_OF_LAST scores
    exist, the mean of the last AVG_OF_LAST scores (horizontal line).

    Does nothing when PLOT is falsy or when there are no scores yet.

    Args:
        scores: sequence of per-episode scores, oldest first.
    """
    if not PLOT or scores is None or len(scores) == 0:
        return

    plt.clf()

    # Running mean via a single pass; the additions happen in the same
    # left-to-right order as sum(scores[:i+1]), so the values are identical.
    running_total = 0
    avg_scores = []
    for count, score in enumerate(scores, start=1):
        running_total += score
        avg_scores.append(running_total / count)
    avg_score = avg_scores[-1]

    x_val = [0, len(scores) - 1]
    plt.plot(scores, 'g-o', label='current score')
    plt.plot(x_val, [avg_score, avg_score], 'r-', label='average score')
    plt.plot(avg_scores, 'b-', label='average scores')
    if len(scores) >= AVG_OF_LAST:
        avg_of_last = sum(scores[-AVG_OF_LAST:]) / AVG_OF_LAST
        plt.plot(x_val, [avg_of_last, avg_of_last], 'k-',
                 label=f'average scores of last {AVG_OF_LAST}')
    plt.xlabel('Epochs--->')
    plt.ylabel('Score--->')
    plt.legend()
    # A short pause lets the GUI backend process events and repaint.
    plt.pause(0.05)

In [11]:
if ONLINE and TPU:
    # Connect to the Colab-provided TPU, whose address is read from the
    # COLAB_TPU_ADDR environment variable, then initialise it.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        tpu=f"grpc://{os.environ['COLAB_TPU_ADDR']}"
    )
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    # NOTE(review): `strategy` is not referenced anywhere else in the visible
    # notebook, so the models do not appear to be built under its scope.
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

In [12]:
class Agent:
    """Deep Q-learning agent with experience replay and a target network.

    `present_q_model` is the network that is trained and used to pick
    actions; `future_q_model` has the same architecture and supplies the
    bootstrapped Q-values of the next state. Its weights are copied from
    `present_q_model` once every UPDATE_AFTER_EPISODES episodes.

    The class relies on a module-level `env` (a gym environment) and on the
    notebook's configuration constants.
    """

    def __init__(self):
        self.state = env.reset().tolist()
        self.done = False
        self.total_score = 0          # reward accumulated since the last average printout
        self.score_per_episode = 0    # reward accumulated in the running episode
        self.all_scores = [0]         # per-episode scores (seeded with a leading 0)
        self.render = True
        self.epsilon = MAX_EPSILON
        self.avg_score = 0

        self.present_q_model = self._build_q_model()
        self.future_q_model = self._build_q_model()
        self.future_q_model.set_weights(self.present_q_model.get_weights())

        self.memory = deque(maxlen=AGENT_MEMORY)
        self.episodes = 0
        # Episode count at which the target network was last synced; the
        # weights were just copied above, so episode 0 counts as synced.
        self._last_sync_episode = 0

        # summary() prints by itself and returns None, so it is not wrapped in print().
        self.present_q_model.summary()
        self.future_q_model.summary()

    @staticmethod
    def _build_q_model():
        """Build and compile one Q-network: Dense/Dropout stacks sized by HIDDEN_LAYERS."""
        model = Sequential()
        model.add(Dense(HIDDEN_LAYERS[0],
                        input_shape=(env.observation_space.low.size,),
                        activation='relu'))
        model.add(Dropout(DROP_PRECENT))
        for units in HIDDEN_LAYERS[1:]:
            model.add(Dense(units, activation='relu'))
            model.add(Dropout(DROP_PRECENT))
        # One linear output per discrete action.
        model.add(Dense(env.action_space.n, activation='linear'))
        model.compile(loss=LOSS_FUNCTION, optimizer=OPTIMIZER,
                      metrics=['accuracy'])
        return model

    def train(self):
        """Fit `present_q_model` on one random minibatch from the replay memory.

        Returns immediately while the memory holds fewer than BATCH_SIZE
        transitions. After fitting, the target network is synced at most once
        each time the episode count reaches a new multiple of
        UPDATE_AFTER_EPISODES.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        # Each memory slot is [action, state, reward, new_state, done].
        batch = random.sample(self.memory, BATCH_SIZE)
        states = np.array([slot[1] for slot in batch])
        next_states = np.array([slot[3] for slot in batch])

        present_q_values = self.present_q_model.predict(states)
        future_q_values = self.future_q_model.predict(next_states)

        for index, (action, _, reward, _, done) in enumerate(batch):
            if done:
                # Terminal transition: nothing to bootstrap from.
                target = reward
            else:
                target = q_new(np.max(future_q_values[index]), reward)
            # Only the taken action's Q-value is moved toward the target;
            # the other outputs keep their predicted values (zero error).
            present_q_values[index][action] = target

        self.present_q_model.fit(states, present_q_values,
                                 batch_size=BATCH_SIZE,
                                 shuffle=False, verbose=0)

        # Sync once per qualifying episode count. Without the second check the
        # copy would run on every step of such an episode, which makes the
        # target network track the online network step by step.
        if (self.episodes % UPDATE_AFTER_EPISODES == 0
                and self.episodes != self._last_sync_episode):
            self.future_q_model.set_weights(self.present_q_model.get_weights())
            self._last_sync_episode = self.episodes

    def next_action(self):
        """Take one epsilon-greedy step in `env`, store it, and train.

        Returns:
            The `done` flag reported by `env.step` for this step.
        """
        if np.random.random() > self.epsilon:
            # Exploit: the network is only queried when its output is used.
            q_values = self.present_q_model.predict(np.array([self.state]))
            action = np.argmax(q_values)
        else:
            # Explore: uniformly random action.
            action = np.random.randint(0, env.action_space.n)

        new_state, reward, done, info = env.step(action)
        self.total_score += reward
        self.score_per_episode += reward

        if done:
            self.all_scores.append(self.score_per_episode)
            self.score_per_episode = 0
            if PLOT:
                plot(self.all_scores)
            self.episodes += 1
            # Rendering itself is currently disabled (no env.render() call);
            # only the flag is maintained.
            self.render = self.episodes % RENDER_AFTER_EPISODES == 0

            if self.episodes % AVG_OF_LAST == 0:
                self.avg_score = self.total_score / AVG_OF_LAST
                print(f"Average Score = {self.avg_score}")
                self.total_score = 0

        self.memory.append([action, self.state, reward, new_state, done])
        # Keep the state in list form in both branches, matching __init__.
        self.state = env.reset().tolist() if done else new_state.tolist()

        self.train()
        if self.epsilon > MIN_EPSILON:
            self.epsilon = max(MIN_EPSILON, self.epsilon * EPSILON_DECAY)

        return done
        

In [13]:
if TRAIN:
    # NOTE(review): gym is imported here, after the configuration cell has
    # overwritten gym's cartpole.py — presumably so the task-specific
    # environment is the one that gets loaded. Confirm before moving it.
    import gym
    env = gym.make('CartPole-v1')
    agent = Agent()
    max_score = 0   # best single-episode score seen so far
    max_avg = 0     # best rolling average over the last AVG_OF_LAST scores
    for episode in tqdm(range(TOTAL_EPISODES)):
        # Step the agent until the current episode terminates.
        while not agent.next_action():
            pass

        latest_score = agent.all_scores[-1]
        if max_score <= latest_score:
            old_peak_path = f"{SAVE}/peak_model-{TASK}-{max_score}.h5"
            max_score = latest_score
            new_peak_path = f"{SAVE}/peak_model-{TASK}-{max_score}.h5"
            try:
                agent.present_q_model.save(new_peak_path)
            except KeyboardInterrupt:
                # An interrupt may have cut the write short: save again so the
                # checkpoint is complete, then propagate the original interrupt.
                agent.present_q_model.save(new_peak_path)
                raise
            # Drop the superseded checkpoint only after the new one is safely
            # on disk, and never delete the file that was just written.
            if old_peak_path != new_peak_path:
                try:
                    os.remove(old_peak_path)
                except FileNotFoundError:
                    pass

        if episode >= AVG_OF_LAST:
            recent_avg = sum(agent.all_scores[-AVG_OF_LAST:]) / AVG_OF_LAST
            # Save only when the rolling average improves; otherwise later,
            # possibly worse weights would overwrite the file named after the
            # best average.
            if recent_avg > max_avg:
                max_avg = recent_avg
                if max_avg >= SAVE_AT_AVG:
                    agent.present_q_model.save(f"{SAVE}/avg_model-{TASK}-{max_avg}.h5")
                    print("saved")

CartPoleEnv - Version 0.2.0, Noise case: 1




  0%|          | 0/1000 [00:00<?, ?it/s][A[A

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 64)                320       
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 258       
Total params: 8,898
Trainable params: 8,898
Non-trainable params: 0
_________________________________________________________________
None
Model: "sequential_3"
_________________________________________________________________
Layer (type)             



  0%|          | 1/1000 [00:00<06:38,  2.51it/s][A[A

  0%|          | 2/1000 [00:01<09:38,  1.72it/s][A[A

  0%|          | 3/1000 [00:01<08:06,  2.05it/s][A[A

  0%|          | 4/1000 [00:01<07:07,  2.33it/s][A[A

  0%|          | 5/1000 [00:02<07:20,  2.26it/s][A[A

  1%|          | 6/1000 [00:02<06:27,  2.57it/s][A[A

  1%|          | 7/1000 [00:02<05:31,  3.00it/s][A[A

  1%|          | 8/1000 [00:03<05:13,  3.17it/s][A[A

  1%|          | 9/1000 [00:03<04:36,  3.58it/s][A[A

  1%|          | 10/1000 [00:03<04:16,  3.86it/s][A[A

  1%|          | 11/1000 [00:03<04:16,  3.85it/s][A[A

  1%|          | 12/1000 [00:04<04:03,  4.06it/s][A[A

  1%|▏         | 13/1000 [00:04<03:46,  4.36it/s][A[A

  1%|▏         | 14/1000 [00:04<04:27,  3.68it/s][A[A

  2%|▏         | 15/1000 [00:04<04:25,  3.71it/s][A[A

  2%|▏         | 16/1000 [00:05<04:40,  3.51it/s][A[A

  2%|▏         | 17/1000 [00:05<04:17,  3.81it/s][A[A

  2%|▏         | 18/1000 [00:05<04:06,

Average Score = 50.08




 10%|█         | 101/1000 [00:58<23:10,  1.55s/it][A[A

 10%|█         | 102/1000 [01:00<22:10,  1.48s/it][A[A

 10%|█         | 103/1000 [01:01<21:36,  1.45s/it][A[A

 10%|█         | 104/1000 [01:02<19:31,  1.31s/it][A[A

 10%|█         | 105/1000 [01:03<18:14,  1.22s/it][A[A

 11%|█         | 106/1000 [01:04<17:14,  1.16s/it][A[A

 11%|█         | 107/1000 [01:05<17:54,  1.20s/it][A[A

 11%|█         | 108/1000 [01:06<17:19,  1.17s/it][A[A

 11%|█         | 109/1000 [01:08<18:03,  1.22s/it][A[A

 11%|█         | 110/1000 [01:10<21:19,  1.44s/it][A[A

 11%|█         | 111/1000 [01:12<24:05,  1.63s/it][A[A

 11%|█         | 112/1000 [01:13<24:54,  1.68s/it][A[A

 11%|█▏        | 113/1000 [01:15<22:09,  1.50s/it][A[A

 11%|█▏        | 114/1000 [01:16<23:10,  1.57s/it][A[A

 12%|█▏        | 115/1000 [01:17<20:20,  1.38s/it][A[A

 12%|█▏        | 116/1000 [01:19<22:35,  1.53s/it][A[A

 12%|█▏        | 117/1000 [01:20<20:27,  1.39s/it][A[A

 12%|█▏     

Average Score = 154.72




 20%|██        | 201/1000 [03:25<14:43,  1.11s/it][A[A

 20%|██        | 202/1000 [03:26<14:20,  1.08s/it][A[A

 20%|██        | 203/1000 [03:27<14:03,  1.06s/it][A[A

 20%|██        | 204/1000 [03:29<14:18,  1.08s/it][A[A

 20%|██        | 205/1000 [03:30<13:43,  1.04s/it][A[A

 21%|██        | 206/1000 [03:31<14:04,  1.06s/it][A[A

 21%|██        | 207/1000 [03:32<14:11,  1.07s/it][A[A

 21%|██        | 208/1000 [03:33<13:52,  1.05s/it][A[A

 21%|██        | 209/1000 [03:34<13:27,  1.02s/it][A[A

 21%|██        | 210/1000 [03:35<13:04,  1.01it/s][A[A

 21%|██        | 211/1000 [03:36<13:48,  1.05s/it][A[A

 21%|██        | 212/1000 [03:37<13:46,  1.05s/it][A[A

 21%|██▏       | 213/1000 [03:38<15:18,  1.17s/it][A[A

 21%|██▏       | 214/1000 [03:39<14:54,  1.14s/it][A[A

 22%|██▏       | 215/1000 [03:40<14:30,  1.11s/it][A[A

 22%|██▏       | 216/1000 [03:42<14:29,  1.11s/it][A[A

 22%|██▏       | 217/1000 [03:43<14:19,  1.10s/it][A[A

 22%|██▏    

Average Score = 118.87




 30%|███       | 301/1000 [05:22<04:18,  2.70it/s][A[A

 30%|███       | 302/1000 [05:22<05:36,  2.08it/s][A[A

 30%|███       | 303/1000 [05:23<04:46,  2.44it/s][A[A

 30%|███       | 304/1000 [05:23<04:52,  2.38it/s][A[A

 30%|███       | 305/1000 [05:23<04:07,  2.81it/s][A[A

 31%|███       | 306/1000 [05:24<04:02,  2.86it/s][A[A

 31%|███       | 307/1000 [05:24<03:27,  3.34it/s][A[A

 31%|███       | 308/1000 [05:24<03:12,  3.59it/s][A[A

 31%|███       | 309/1000 [05:24<02:56,  3.92it/s][A[A

 31%|███       | 310/1000 [05:24<02:52,  4.01it/s][A[A

 31%|███       | 311/1000 [05:25<02:44,  4.20it/s][A[A

 31%|███       | 312/1000 [05:25<02:40,  4.28it/s][A[A

 31%|███▏      | 313/1000 [05:25<02:28,  4.61it/s][A[A

 31%|███▏      | 314/1000 [05:25<02:28,  4.61it/s][A[A

 32%|███▏      | 315/1000 [05:25<02:27,  4.66it/s][A[A

 32%|███▏      | 316/1000 [05:26<02:28,  4.60it/s][A[A

 32%|███▏      | 317/1000 [05:26<02:22,  4.81it/s][A[A

 32%|███▏   

Average Score = 194.88




 40%|████      | 401/1000 [09:10<15:16,  1.53s/it][A[A

 40%|████      | 402/1000 [09:11<15:38,  1.57s/it][A[A

 40%|████      | 403/1000 [09:12<14:21,  1.44s/it][A[A

 40%|████      | 404/1000 [09:16<20:20,  2.05s/it][A[A

 40%|████      | 405/1000 [09:17<18:57,  1.91s/it][A[A

 41%|████      | 406/1000 [09:21<24:02,  2.43s/it][A[A

 41%|████      | 407/1000 [09:21<18:01,  1.82s/it][A[A

 41%|████      | 408/1000 [09:22<13:31,  1.37s/it][A[A

 41%|████      | 409/1000 [09:23<13:18,  1.35s/it][A[A

 41%|████      | 410/1000 [09:25<15:26,  1.57s/it][A[A

 41%|████      | 411/1000 [09:27<14:54,  1.52s/it][A[A

 41%|████      | 412/1000 [09:29<16:27,  1.68s/it][A[A

 41%|████▏     | 413/1000 [09:30<15:40,  1.60s/it][A[A

 41%|████▏     | 414/1000 [09:31<12:53,  1.32s/it][A[A

 42%|████▏     | 415/1000 [09:33<15:45,  1.62s/it][A[A

 42%|████▏     | 416/1000 [09:35<17:01,  1.75s/it][A[A

 42%|████▏     | 417/1000 [09:37<16:52,  1.74s/it][A[A

 42%|████▏  

KeyboardInterrupt: 

In [16]:
if PLAY:
    # Evaluate a saved model greedily (no exploration) for PLAY_EPISODES episodes.
    import gym
    env = gym.make('CartPole-v1')
    score = input("Enter the score of the model you want to load ")
    model_type = input("Enter model type peak_model or avg_model ")
    trained_agent = load_model(f"{SAVE}/{model_type}-{TASK}-{score}.h5")
    scores = []
    for _ in tqdm(range(PLAY_EPISODES)):
        # Start every episode from the observation returned by reset();
        # otherwise the first action would be chosen from the previous
        # episode's terminal state.
        state = env.reset()
        score_per_episode = 0
        done = False
        while not done:
            action = np.argmax(trained_agent.predict(np.array([state])))
            state, reward, done, info = env.step(action)
            score_per_episode += reward
            if SHOW:
                env.render()
        scores.append(score_per_episode)
        if PLOT:
            plot(scores)
    # Mean score over the episodes actually played (nothing to report if none).
    if scores:
        print(sum(scores)/len(scores))

CartPoleEnv - Version 0.2.0, Noise case: 1
Enter the score of the model you want to load 500.0
Enter model type peak_model or avg_model peak_model






  0%|          | 0/100 [00:00<?, ?it/s][A[A[A[A



  1%|          | 1/100 [00:01<02:27,  1.49s/it][A[A[A[A



  2%|▏         | 2/100 [00:02<02:18,  1.41s/it][A[A[A[A



  3%|▎         | 3/100 [00:04<02:14,  1.39s/it][A[A[A[A



  4%|▍         | 4/100 [00:05<02:07,  1.33s/it][A[A[A[A



  5%|▌         | 5/100 [00:06<02:00,  1.27s/it][A[A[A[A



  6%|▌         | 6/100 [00:07<01:55,  1.23s/it][A[A[A[A



  7%|▋         | 7/100 [00:08<01:52,  1.20s/it][A[A[A[A



  8%|▊         | 8/100 [00:09<01:49,  1.19s/it][A[A[A[A



  9%|▉         | 9/100 [00:10<01:46,  1.17s/it][A[A[A[A



 10%|█         | 10/100 [00:12<01:44,  1.16s/it][A[A[A[A



 11%|█         | 11/100 [00:13<01:42,  1.15s/it][A[A[A[A



 12%|█▏        | 12/100 [00:14<01:41,  1.15s/it][A[A[A[A



 13%|█▎        | 13/100 [00:15<01:40,  1.15s/it][A[A[A[A



 14%|█▍        | 14/100 [00:16<01:39,  1.15s/it][A[A[A[A



 15%|█▌        | 15/100 [00:17<01:37,  1.15s/it][A[A

500.0



