In [1]:
# This is for the cart-pole game, it's all about my experiments with the game

In [2]:
# After 500 time steps the game automaticly terminates

import time
import random
import heapq as hp
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

EPISODES = 1000


class DQNAgent:
    def __init__(self, state_size, action_size,epsilon_decay=.99,gamma=0.95):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = {}
        self.pqt=[]
        self.gamma = gamma   # discount rate 0.95
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = epsilon_decay  #0.99
        self.learning_rate = 0.001  # the learning rate
        self.model = self._build_model()
        self.pev_model=self.model        
        
    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        cort_rwd=(reward + self.gamma *
                          (self.pev_model.predict(next_state)[0])[self.act_unbaised(next_state)])
        prd_rwd=self.model.predict(next_state)[0][action]
        hp.heappush(self.pqt,-abs(float(cort_rwd - prd_rwd)))
        self.memory[-abs(float(cort_rwd - prd_rwd))]=(state, action, reward, next_state, done)
        
    
    def act_unbaised(self,state):
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])
        
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action
    
    def replay(self, batch_size):
        
        for _ in range(batch_size):
            
            state, action, reward, next_state, done = self.memory[hp.heappop(self.pqt)]
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.pev_model.predict(next_state)[0]))  # q(s,a) = r + max(q(_s,_a)) over _a
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    time_start=time.time()
    
    agent = DQNAgent(state_size, action_size)
    # agent.load("./save/cartpole-dqn.h5")
    done = False
    batch_size = 64
    vally=0
    for e in range(EPISODES):
        
        done=False
        state = env.reset()  # It's obdervation 'o'
        state = np.reshape(state, [1, state_size]) # Encapsulating whole thing into and array i.e [[1,12,2,3]]
        for t in range(500):      # The 1st comment, actually 'for' loop will work too
            
            env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, t, agent.epsilon))
                break
        if t>=499 : 
            vally+=1
            print("Time for which pole stand:",t,vally)
            if vally==10:
                break
                print("Done after episode:",e-1)
        if len(agent.pqt) > batch_size:
            agent.replay(batch_size)
        agent.pev_model=agent.model
    end_time=time.time()
    print(end_time-time_start)
        # if e % 10 == 0:
#     agent.save("./save/cartpole-dqn.h5")

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
episode: 0/1000, score: 18, e: 1.0
episode: 1/1000, score: 27, e: 1.0
episode: 2/1000, score: 24, e: 1.0
episode: 3/1000, score: 17, e: 0.99
episode: 4/1000, score: 12, e: 0.99
episode: 5/1000, score: 23, e: 0.99
episode: 6/1000, score: 14, e: 0.99
episode: 7/1000, score: 18, e: 0.98
episode: 8/1000, score: 11, e: 0.98
episode: 9/1000, score: 26, e: 0.98
episode: 10/1000, score: 17, e: 0.97
episode: 11/1000, score: 27, e: 0.97
episode: 12/1000, score: 11, e: 0.97
episode: 13/1000, score: 50, e: 0.96
episode: 14/1000, score: 12, e: 0.96
episode: 15/1000, score: 24, e: 0.95
episode: 16/1000, score: 15, e: 0.95
episode: 17/1000, score: 29, e: 0.95
episode: 18/1000, score: 34, e: 0.94
episode: 19/1000, score: 16, e: 0.94
episode: 20/1000, score: 10, e: 0.94
episode: 21/1000, score: 17, e: 0.93
episode: 22/1000, score: 9, e: 0.93
episode: 23/1000, score: 19, e: 0.93
episode: 24/1000, 

episode: 216/1000, score: 204, e: 0.36
episode: 217/1000, score: 205, e: 0.35
episode: 218/1000, score: 143, e: 0.35
episode: 219/1000, score: 142, e: 0.34
episode: 220/1000, score: 385, e: 0.34
episode: 221/1000, score: 499, e: 0.34
Time for which pole stand: 499 1
episode: 222/1000, score: 165, e: 0.33
episode: 223/1000, score: 190, e: 0.33
episode: 224/1000, score: 246, e: 0.33
episode: 225/1000, score: 216, e: 0.32
episode: 226/1000, score: 146, e: 0.32
episode: 227/1000, score: 186, e: 0.32
episode: 228/1000, score: 169, e: 0.31
episode: 229/1000, score: 276, e: 0.31
episode: 230/1000, score: 76, e: 0.31
episode: 231/1000, score: 306, e: 0.31
episode: 232/1000, score: 318, e: 0.3
episode: 233/1000, score: 376, e: 0.3
episode: 234/1000, score: 383, e: 0.3
episode: 235/1000, score: 208, e: 0.29
episode: 236/1000, score: 299, e: 0.29
episode: 237/1000, score: 221, e: 0.29
episode: 238/1000, score: 499, e: 0.28
Time for which pole stand: 499 2
episode: 239/1000, score: 414, e: 0.28
ep

In [4]:
agent.memory.clear()
agent.pqt.clear()

In [2]:
#### Time took = 187.36419129371643 , 231.45736050605774
# After episode 367 time : 55.953306913375854 Avg evaluation : 479.72 for vally=5
"""For epsilon_Decay=0.98
    Episode 413 time 93.76821875572205
    Avg evavluation : 499
    for vally=10 time: 104.12895131111145"""
# now vally=10 
''' After episode 396 time : 101.89768052101135 Avg : 422.85,
    nxt time episode 386 time : 117.12235856056213 avg 485.68
    nxt time episode 503 time : 211.13185095787048 avg 438.84
# epsilon_decay=0.99
    nxt time episode 283 time : 58.04395508766174 avg  498.18
    nxt time episode 234 time : 46.76658082008362 avg  492.76
    nxt time episode 359 time : 96.6677758693695 avg  499.00
# epsilon_decay=0.995
    nxt time episode 395 time : 87.64711785316467 avg  491.93
    nxt time episode 479 time : 152.34307408332825 avg  490.36
    nxt time episode 397 time : 84.11703038215637 avg  499.00'''
# epsilon_decay=0.995
''' for the normal one: episode 541 Avg : 374.64 nxt time(same kernal) episode 394 avg 499
    nxt time(diff_kernal) episode : deoest converge avg:146.97
    nxt time(diff_kernal) episode : 720 avg:389.93
# epsilon_decay=0.99
    nxt time(diff_kernal) episode : 399 avg:290.88'''

' for the normal one: episode 541 Avg : 374.64 nxt time(same kernal) episode 394 avg 499\n    nxt time(diff_kernal) episode : deoest converge avg:146.97\n    nxt time(diff_kernal) episode : 720 avg:389.93\n# epsilon_decay=0.99\n    nxt time(diff_kernal) episode : 399 avg:290.88'

In [2]:
def policy_evaluation(env,agent):
    avg=0
    for i in range(100):
        #env.seed(i)
        state=env.reset()
        state=np.reshape(state,[1,agent.state_size])
        action=np.argmax(agent.model.predict(state)[0])
        nxt_state, reward, done, info = env.step(action)
        net_r=0
        while not done:
            net_r+=reward
            state=nxt_state
            state=np.reshape(state, [1,agent.state_size])
            action=np.argmax(agent.model.predict(state)[0])
            nxt_state, reward, done, info = env.step(action)
            #print(nxt_state, reward, done)
        avg+=net_r
        print("Reward for {} episode: {}".format(i,net_r))
    print("Avg reward: ",avg/(i+1) )
    env.close()

In [3]:
policy_evaluation(env,agent)

Reward for 0 episode: 165.0
Reward for 1 episode: 499.0
Reward for 2 episode: 458.0
Reward for 3 episode: 499.0
Reward for 4 episode: 179.0
Reward for 5 episode: 499.0
Reward for 6 episode: 499.0
Reward for 7 episode: 499.0
Reward for 8 episode: 297.0
Reward for 9 episode: 293.0
Reward for 10 episode: 499.0
Reward for 11 episode: 313.0
Reward for 12 episode: 499.0
Reward for 13 episode: 499.0
Reward for 14 episode: 357.0
Reward for 15 episode: 499.0
Reward for 16 episode: 467.0
Reward for 17 episode: 492.0
Reward for 18 episode: 335.0
Reward for 19 episode: 461.0
Reward for 20 episode: 304.0
Reward for 21 episode: 499.0
Reward for 22 episode: 499.0
Reward for 23 episode: 499.0
Reward for 24 episode: 499.0
Reward for 25 episode: 373.0
Reward for 26 episode: 499.0
Reward for 27 episode: 499.0
Reward for 28 episode: 481.0
Reward for 29 episode: 195.0
Reward for 30 episode: 499.0
Reward for 31 episode: 499.0
Reward for 32 episode: 304.0
Reward for 33 episode: 180.0
Reward for 34 episode: 4

In [4]:
import time
import random
import heapq as hp
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

EPISODES = 1000


class DQNAgent:
    def __init__(self, state_size, action_size,epsilon_decay,gamma):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = {}
        self.pqt=[]
        self.gamma = gamma   # discount rate 0.95
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = epsilon_decay  #0.99
        self.learning_rate = 0.001  # the learning rate
        self.model = self._build_model()
        self.pev_model=self.model        
        
    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        cort_rwd=(reward + self.gamma *
                          (self.pev_model.predict(next_state)[0])[self.act_unbaised(next_state)])
        prd_rwd=self.model.predict(next_state)[0][action]
        hp.heappush(self.pqt,-abs(float(cort_rwd - prd_rwd)))
        self.memory[-abs(float(cort_rwd - prd_rwd))]=(state, action, reward, next_state, done)
        
    
    def act_unbaised(self,state):
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])
    
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action
    
    def replay(self, batch_size):
        
        for _ in range(batch_size):
            
            state, action, reward, next_state, done = self.memory[hp.heappop(self.pqt)]
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.pev_model.predict(next_state)[0]))  # q(s,a) = r + max(q(_s,_a)) over _a
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


In [5]:
# After 500 time steps the game automaticly terminates



def do_it(agent):
    env.reset()
    time_start=time.time()
    
    
    # agent.load("./save/cartpole-dqn.h5")
    done = False
    batch_size = 32
    vally=[]
    for e in range(EPISODES):

        done=False
        state = env.reset()  # It's obdervation 'o'
        state = np.reshape(state, [1, state_size]) # Encapsulating whole thing into and array i.e [[1,12,2,3]]
        for t in range(500):      # The 1st comment, actually 'for' loop will work too
            
            env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
           # agent.pev_model=agent.model  # I think this should be episidic not for every t
            if done:
               # print("episode: {}/{}, score: {}, e: {:.2}"
                #      .format(e, EPISODES, t, agent.epsilon))
                break
        if t>=499 : 
            vally.append(e-1)
            #print("Time for which pole stand:",t,vally)
            if len(vally)==10:
                break
                
        if len(agent.pqt) > batch_size:
            agent.replay(batch_size)
        agent.pev_model=agent.model
    end_time=time.time()
    
    
    avg=0
    for i in range(100):
        #env.seed(i)
        state=env.reset()
        state=np.reshape(state,[1,agent.state_size])
        action=np.argmax(agent.model.predict(state)[0])
        nxt_state, reward, done, info = env.step(action)
        net_r=0
        while not done:
            net_r+=reward
            state=nxt_state
            state=np.reshape(state, [1,agent.state_size])
            action=np.argmax(agent.model.predict(state)[0])
            nxt_state, reward, done, info = env.step(action)
            #print(nxt_state, reward, done)
        avg+=net_r
        #print("Reward for {} episode: {}".format(i,net_r))
    
    agent.memory.clear()
    agent.pqt.clear()
    
    return([end_time-time_start, vally, avg/(i+1), agent])
        # if e % 10 == 0:
#     agent.save("./save/cartpole-dqn.h5")

In [6]:
#Set some hyperparameters gamma, epsilon_decay,min_decay
gammaLst=[0.8,0.85,0.9,0.95,1]
epsilon_decayLst=[0.5,0.7,0.8,0.9,0.95,0.98,0.985,0.99,0.995]

In [7]:
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent_hyp=[]
for i in range(len(gammaLst)):
    agent_gamma=[]
    for j in range(len(epsilon_decayLst)):
        finl=[]
        for po in range(3):
            finl.append(DQNAgent(state_size, action_size,epsilon_decayLst[j],gammaLst[i]))
        agent_gamma.append(finl)
    agent_hyp.append(agent_gamma)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [8]:
agent_hyp_perf=[]
crt=0
for i in range(len(gammaLst)):
    print('Gamma : ',gammaLst[i] )
    agent_gamma_perf=[]
    for j in range(len(epsilon_decayLst)):
        print('     Epsilon_Decay : ',epsilon_decayLst[j] )
        finl_perf=[]
        for po in range(3):
            finl_perf.append(do_it(agent_hyp[i][j][po]))
            print('            Trial : ',po+1 )
            print('            Time : ',finl_perf[po][0] )
            print('            Vally : ',finl_perf[po][1] )
            print('            Avg_evaluation : ',finl_perf[po][2] )
            crt+=1
            print('Done for :'+str(crt)+'/135')
        agent_gamma_perf.append([[epsilon_decayLst[j],gammaLst[i]],finl_perf])
    agent_hyp_perf.append(agent_gamma_perf)
    print("Done for gamma : ",gammaLst[i])
    print(agent_gamma_perf)

Gamma :  0.8
     Epsilon_Decay :  0.5
            Trial :  1
            Time :  121.74121117591858
            Vally :  []
            Avg_evaluation :  120.85
Done for :1/135
            Trial :  2
            Time :  165.40946245193481
            Vally :  [831, 861, 924, 957]
            Avg_evaluation :  153.86
Done for :2/135
            Trial :  3
            Time :  78.41466903686523
            Vally :  []
            Avg_evaluation :  8.35
Done for :3/135
     Epsilon_Decay :  0.7
            Trial :  1
            Time :  167.08403635025024
            Vally :  [885, 900, 981]
            Avg_evaluation :  124.46
Done for :4/135
            Trial :  2
            Time :  105.02711081504822
            Vally :  [943, 967]
            Avg_evaluation :  74.41
Done for :5/135
            Trial :  3
            Time :  74.88621354103088
            Vally :  []
            Avg_evaluation :  8.41
Done for :6/135
     Epsilon_Decay :  0.8
            Trial :  1
            Time :  

            Trial :  3
            Time :  110.5555329322815
            Vally :  []
            Avg_evaluation :  25.25
Done for :30/135
     Epsilon_Decay :  0.7
            Trial :  1
            Time :  178.38320994377136
            Vally :  [534, 535, 536, 537, 549, 550, 555, 556, 557, 559]
            Avg_evaluation :  489.71
Done for :31/135
            Trial :  2
            Time :  92.68818712234497
            Vally :  []
            Avg_evaluation :  8.4
Done for :32/135
            Trial :  3
            Time :  293.8559672832489
            Vally :  [350, 353, 378, 386, 396, 398, 406, 409, 411, 438]
            Avg_evaluation :  399.97
Done for :33/135
     Epsilon_Decay :  0.8
            Trial :  1
            Time :  562.1791880130768
            Vally :  [725, 891, 968, 970, 979]
            Avg_evaluation :  230.59
Done for :34/135
            Trial :  2
            Time :  383.0894191265106
            Vally :  [551, 591, 615, 625, 662, 681, 685, 699, 718, 723]
    

            Trial :  3
            Time :  232.46793937683105
            Vally :  [216, 270, 282, 286, 374, 379, 380, 381, 382, 383]
            Avg_evaluation :  498.96
Done for :57/135
     Epsilon_Decay :  0.7
            Trial :  1
            Time :  241.41136622428894
            Vally :  []
            Avg_evaluation :  98.16
Done for :58/135
            Trial :  2
            Time :  395.5391414165497
            Vally :  [338, 354, 368, 508, 530, 538, 544, 547, 554, 591]
            Avg_evaluation :  408.29
Done for :59/135
            Trial :  3
            Time :  119.20256638526917
            Vally :  []
            Avg_evaluation :  13.14
Done for :60/135
     Epsilon_Decay :  0.8
            Trial :  1
            Time :  227.5701184272766
            Vally :  []
            Avg_evaluation :  130.81
Done for :61/135
            Trial :  2
            Time :  344.102525472641
            Vally :  [300, 324, 326, 332, 333, 340, 342, 368, 370, 373]
            Avg_evaluati

            Trial :  2
            Time :  441.75407576560974
            Vally :  [455, 474, 476, 481, 482, 491, 531, 533, 535, 536]
            Avg_evaluation :  459.04
Done for :83/135
            Trial :  3
            Time :  545.9164481163025
            Vally :  [711, 757, 770, 777, 912, 915, 919, 928, 940, 944]
            Avg_evaluation :  435.19
Done for :84/135
     Epsilon_Decay :  0.7
            Trial :  1
            Time :  473.8448796272278
            Vally :  [768, 769, 773, 774]
            Avg_evaluation :  148.73
Done for :85/135
            Trial :  2
            Time :  796.1143538951874
            Vally :  [625, 674, 683, 697, 755, 962, 963, 975, 992]
            Avg_evaluation :  199.28
Done for :86/135
            Trial :  3
            Time :  330.33265137672424
            Vally :  [279, 293, 321, 371, 415, 439, 440, 441, 449, 451]
            Avg_evaluation :  418.77
Done for :87/135
     Epsilon_Decay :  0.8
            Trial :  1
            Time :  645

            Trial :  1
            Time :  901.4009943008423
            Vally :  []
            Avg_evaluation :  26.6
Done for :109/135
            Trial :  2
            Time :  269.86127400398254
            Vally :  []
            Avg_evaluation :  71.12
Done for :110/135
            Trial :  3
            Time :  323.1885316371918
            Vally :  [448, 522, 543, 549, 552, 584, 589, 590, 591, 592]
            Avg_evaluation :  329.38
Done for :111/135
     Epsilon_Decay :  0.7
            Trial :  1
            Time :  297.24556517601013
            Vally :  [312, 342, 414, 422, 424, 451, 452, 454, 455, 457]
            Avg_evaluation :  490.19
Done for :112/135
            Trial :  2
            Time :  717.5834314823151
            Vally :  [956, 998]
            Avg_evaluation :  335.94
Done for :113/135
            Trial :  3
            Time :  214.11640644073486
            Vally :  []
            Avg_evaluation :  12.16
Done for :114/135
     Epsilon_Decay :  0.8
     

In [6]:
import os
data_root='CartPole/Phase-3/Models'
for i in range(len(gammaLst)):
    for j in range(len(epsilon_decayLst)):
        for po in range(3):
            agent_hyp[i][j][po].save(os.path.join(data_root,'G'+str(gammaLst[i])+'ED'+str(epsilon_decayLst[j])+'trail'+str(po)+'.h5'))