In [452]:
import gym
import random
import numpy as np
from statistics import median, mean
from collections import Counter
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras import backend as K
import matplotlib.pyplot as plt
from IPython.display import clear_output
from collections import deque

In [453]:
# --- Experiment configuration -------------------------------------------------
SEED = 42          # seed for the Python and NumPy global RNGs
LR = 1e-3          # learning-rate constant
goal_steps = 500   # maximum number of environment steps played per episode
EPISODES = 2000    # number of training episodes

# Seed the two global RNGs this notebook draws from directly (epsilon-greedy
# exploration and replay sampling). The Keras weight initialisation and the
# environment's own random stream are NOT seeded here, so runs are only
# partially repeatable.
random.seed(SEED)
np.random.seed(SEED)

env = gym.make("CartPole-v1")

CartPoleEnv - Version 0.2.0, Noise case: 1


In [454]:
#Plot score over episodes
def plot_res(values, title='', goal=500):
    """Plot the per-episode scores next to a histogram of the latest scores.

    Parameters
    ----------
    values : sequence of numbers
        Score of every finished episode, in chronological order. Must support
        ``len()`` and slicing.
    title : str, optional
        Super-title shown above both panels.
    goal : number, optional
        Target score, drawn as a red reference line on both panels so the two
        views agree on what the goal is.
    """
    # Replace the previous figure instead of stacking a new one under it.
    clear_output(wait=True)

    f, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
    f.suptitle(title)

    # Left panel: raw score per episode plus the goal line.
    ax[0].plot(values, label='score per run')
    ax[0].axhline(goal, c='red', ls='--', label='goal')
    ax[0].set_xlabel('Episodes')
    ax[0].set_ylabel('Reward')

    # Linear trend; a straight-line fit needs at least two points.
    x = np.arange(len(values))
    if len(values) >= 2:
        try:
            z = np.polyfit(x, values, 1)
        except (TypeError, ValueError, np.linalg.LinAlgError) as err:
            # The trend is decoration only: report why it is missing and go on.
            print('Trend line skipped: {}'.format(err))
        else:
            ax[0].plot(x, np.poly1d(z)(x), "--", label='trend')
    # Build the legend after every line exists so the trend is listed too.
    ax[0].legend()

    # Right panel: distribution of the most recent 50 scores.
    ax[1].hist(values[-50:])
    ax[1].axvline(goal, c='red', label='goal')
    ax[1].set_xlabel('Scores per Last 50 Episodes')
    ax[1].set_ylabel('Frequency')
    ax[1].legend()
    plt.show()

In [455]:
def neural_network_model(input_size, output_size, learning_rate=0.001, hidden_units=24):
    """Build and compile a small fully connected network.

    The network has two ReLU hidden layers of ``hidden_units`` neurons each and
    a linear output layer with one unit per output, trained with mean squared
    error and the Adam optimizer.

    Parameters
    ----------
    input_size : int
        Number of input features.
    output_size : int
        Number of linear output units.
    learning_rate : float, optional
        Step size passed to Adam.
    hidden_units : int, optional
        Width of each of the two hidden layers.

    Returns
    -------
    A compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(hidden_units, input_dim=input_size, activation='relu'))
    model.add(Dense(hidden_units, activation='relu'))
    model.add(Dense(output_size, activation='linear'))

    # No 'accuracy' metric: the outputs are regressed onto real-valued targets
    # with MSE, so a classification accuracy carries no information here.
    # NOTE(review): the `lr` keyword is kept exactly as before; newer Keras
    # releases spell it `learning_rate` - confirm against the installed version.
    model.compile(loss='mse', optimizer=Adam(lr=learning_rate))

    return model

In [456]:
# Network dimensions come straight from the environment's spaces.
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

print(input_size)
print(output_size)

# `model` is the network being trained; `target_model` is the second network
# used when computing training targets.
model = neural_network_model(input_size, output_size)
target_model = neural_network_model(input_size, output_size)
# Both networks are built with independent random weights. Copy the online
# weights across once so the target network starts as an exact clone and does
# not depend on a later sync having happened before it is first used.
target_model.set_weights(model.get_weights())

4
2


In [457]:
# Epsilon-greedy settings: `epsilon` is the probability of taking a random
# action; `epsilon_min` and `epsilon_decay` are the floor and the decay factor
# for it.
epsilon_min = 0.01
epsilon_decay = 0.995
epsilon = 1.0

def next_action(state, output_size):
    """Choose an action with an epsilon-greedy rule.

    With probability ``epsilon`` (module-level) a uniformly random action index
    in ``range(output_size)`` is returned; otherwise the index of the largest
    value predicted by the module-level ``model`` for ``state`` is returned.
    """
    exploit = np.random.rand() > epsilon
    if exploit:
        # Exploitation: turn the state into a single-row batch and take the
        # action whose predicted value is highest.
        batch = state.reshape(-1, len(state))
        predicted = model.predict(batch)
        return np.argmax(predicted)
    # Exploration: random action.
    return random.randrange(output_size)

In [458]:
gamma = 0.9  # discount factor applied to the bootstrapped next-state value


def replay(model, training_data, train_size):
    """Fit ``model`` on a random sample of stored transitions.

    Each transition is a tuple ``(state, reward, done, new_state, action)``
    where ``state`` and ``new_state`` are single-row 2-D arrays. For every
    sampled transition the target for the taken action is

    * ``reward`` when ``done`` is true, otherwise
    * ``reward + gamma * Q_target(new_state)[argmax Q_model(new_state)]``,

    i.e. the action is selected by ``model`` and evaluated by the module-level
    ``target_model``. The outputs for all other actions are left at the
    model's own current predictions.

    All predictions are made in three batched calls using the weights the
    model has when this function is entered, and the model is then fitted with
    ``batch_size=1`` and ``shuffle=False`` so it still takes one gradient step
    per sampled transition, in sample order.

    After training, the module-level ``epsilon`` is multiplied by
    ``epsilon_decay`` and never pushed below ``epsilon_min``.

    Parameters
    ----------
    model : Keras model
        Network to train (also used to pick the next-state action).
    training_data : sequence of transitions
        Replay memory to sample from.
    train_size : int
        Number of transitions to sample without replacement.

    Returns
    -------
    The same ``model`` object. It is returned untouched (and ``epsilon`` is
    not decayed) when fewer than ``train_size`` transitions are available.
    """
    global epsilon

    # Not enough experience yet: random.sample would raise, so skip instead.
    if train_size < 1 or len(training_data) < train_size:
        return model

    batch = random.sample(training_data, train_size)

    # Unpack the sampled tuples into column arrays; each stored state is a
    # single-row array, so stacking yields one row per transition.
    states = np.vstack([transition[0] for transition in batch])
    rewards = np.array([transition[1] for transition in batch], dtype=float)
    dones = np.array([transition[2] for transition in batch], dtype=bool)
    next_states = np.vstack([transition[3] for transition in batch])
    actions = np.array([transition[4] for transition in batch], dtype=int)

    # Three batched forward passes replace three calls per transition.
    targets = model.predict(states, verbose=0)
    online_next = model.predict(next_states, verbose=0)
    frozen_next = target_model.predict(next_states, verbose=0)

    rows = np.arange(train_size)
    best_next = np.argmax(online_next, axis=1)
    bootstrapped = rewards + gamma * frozen_next[rows, best_next]
    # Terminal transitions have no future value to bootstrap from.
    targets[rows, actions] = np.where(dones, rewards, bootstrapped)

    model.fit(states, targets, batch_size=1, shuffle=False, epochs=1, verbose=0)

    if epsilon > epsilon_min:
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

    return model

In [None]:
scores = []
# Replay memory; once 100000 transitions are stored the oldest are discarded.
game_memory = deque(maxlen=100000)

train_size = 32  # number of transitions sampled per replay() call

for game in range(EPISODES):
    state = env.reset()
    score = 0
    for step in range(goal_steps):
        action = next_action(state, output_size)

        new_state, reward, done, info = env.step(action)

        # The score always counts the raw environment reward.
        score += reward

        # A `done` on the final allowed step is treated as reaching the step
        # budget rather than as a failure: it is neither penalised nor stored
        # as terminal, so surviving the whole episode is not punished.
        failed = bool(done) and step + 1 < goal_steps
        if failed:
            reward = -reward

        # Transition layout expected by replay():
        # (state, reward, done, new_state, action), states as single-row arrays.
        game_memory.append((
            state.reshape(-1, len(state)),
            reward,
            failed,
            new_state.reshape(-1, len(new_state)),
            action,
        ))

        state = new_state
        if done:
            # Refresh the target network at the end of every finished episode.
            target_model.set_weights(model.get_weights())
            break

    # One replay pass per episode once the memory holds more than a batch.
    if len(game_memory) > train_size:
        replay(model, game_memory, train_size)

    if game % 100 == 0:
        print('Game {} score : {}'.format(game, score))

    scores.append(score)

# Guard against EPISODES == 0, which would otherwise divide by zero.
if scores:
    print('Average Score:', sum(scores)/len(scores))

Game 0 score : 14.0
Game 100 score : 26.0
Game 200 score : 22.0
Game 300 score : 77.0
