In [1]:
import random
import gym
import numpy as np
from collections import deque


from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor
from scores.score_logger import ScoreLogger
from sklearn.model_selection import train_test_split

# Gym environment id the agent is trained on.
ENV_NAME = "CartPole-v1"

# Epsilon-greedy schedule: exploration starts at MAX, is multiplied by DECAY
# after every refit of the model, and is clamped so it never goes below MIN.
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.05
EXPLORATION_DECAY = 0.96

# MEMORY_SIZE caps the replay deque (oldest transitions are dropped first);
# BATCH_SIZE is the minimum number of stored transitions before any fit happens.
MEMORY_SIZE = 1000
BATCH_SIZE = 20

# Discount factor applied to the best predicted next-state value.
GAMMA = 0.95
# Not referenced by any of the visible cells.
LEARNING_RATE = 0.001

In [2]:
class DQNSolver:
    """Epsilon-greedy Q-learning agent whose Q-function is a fitted regressor.

    The Q-function is a ``MultiOutputRegressor`` wrapping an ``LGBMRegressor``,
    producing one output column per action. Transitions are kept in a bounded
    deque; each call to :meth:`experience_replay` refits the model on every
    stored transition and then decays the exploration rate.

    Attributes:
        exploration_rate: Current probability of taking a random action.
        action_space: Number of discrete actions.
        memory: Deque of ``(state, action, reward, next_state, done)`` tuples,
            capped at ``MEMORY_SIZE`` entries.
        model: The regressor used to estimate Q-values.
        isFit: ``False`` until the first successful fit; while ``False`` all
            Q-values are treated as zero.
    """

    def __init__(self, observation_space, action_space):
        """Create an agent with an empty memory and an unfitted model.

        Args:
            observation_space: Number of features in one observation. Kept for
                interface compatibility; the agent does not read it.
            action_space: Number of discrete actions.
        """
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        # Other regressors left as options by the original author:
        #   KNeighborsRegressor(n_jobs=-1)
        #   MultiOutputRegressor(SVR(), n_jobs=8)
        self.model = MultiOutputRegressor(LGBMRegressor(n_estimators=100, n_jobs=-1))
        self.isFit = False

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def _q_values(self, states):
        """Return a fresh, writable ``(n_states, n_actions)`` array of Q estimates.

        Before the first fit there is no model to query, so zeros are returned.
        """
        states = np.asarray(states)
        if not self.isFit:
            return np.zeros((len(states), self.action_space))
        # Copy so callers may overwrite entries without touching model internals.
        return np.array(self.model.predict(states), dtype=float)

    def act(self, state):
        """Choose an action for ``state`` (a ``(1, n_features)`` array).

        With probability ``exploration_rate`` a uniformly random action is
        returned; otherwise the index of the largest estimated Q-value.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        return np.argmax(self._q_values(state)[0])

    def experience_replay(self):
        """Refit the model on the whole memory and decay the exploration rate.

        Does nothing while fewer than ``BATCH_SIZE`` transitions are stored.
        For each transition the target row is the current Q estimate with the
        taken action's entry replaced by ``reward`` (terminal transition or
        model not yet fitted) or ``reward + GAMMA * max_a Q(next_state, a)``.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        # Sampling every element without replacement yields a shuffled copy.
        transitions = list(self.memory)
        batch = random.sample(transitions, len(transitions))
        states, actions, rewards, next_states, terminals = zip(*batch)

        # Each stored state is (1, n_features); stack the single rows.
        states = np.array([s[0] for s in states])
        next_states = np.array([s[0] for s in next_states])
        actions = np.asarray(actions, dtype=np.intp)
        rewards = np.asarray(rewards, dtype=float)
        terminals = np.asarray(terminals, dtype=bool)

        # Two batched predictions replace two single-row predictions per transition.
        targets = self._q_values(states)
        if self.isFit:
            best_next = np.amax(self._q_values(next_states), axis=1)
            q_updates = np.where(terminals, rewards, rewards + GAMMA * best_next)
        else:
            q_updates = rewards
        targets[np.arange(len(batch)), actions] = q_updates

        self.model.fit(states, targets)
        self.isFit = True
        self.exploration_rate = max(EXPLORATION_MIN,
                                    self.exploration_rate * EXPLORATION_DECAY)

In [3]:
def _reset_env(env):
    """Reset ``env`` and return only the observation, for either Gym API generation."""
    result = env.reset()
    # gym >= 0.26 returns (observation, info); older releases return the observation alone.
    return result[0] if isinstance(result, tuple) else result


def _step_env(env, action):
    """Step ``env`` and return ``(observation, reward, done, info)`` for either Gym API generation."""
    result = env.step(action)
    if len(result) == 5:
        # gym >= 0.26 splits the old ``done`` flag into terminated / truncated.
        observation, reward, terminated, truncated, info = result
        return observation, reward, terminated or truncated, info
    return result


env = gym.make(ENV_NAME)
score_logger = ScoreLogger(ENV_NAME)
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n
dqn_solver = DQNSolver(observation_space, action_space)

# The outer loop has no exit condition of its own.
# NOTE(review): the recorded output of this cell ends with
# "NameError: name 'exit' is not defined" right after a "Solved in ..." line,
# presumably raised inside ScoreLogger.add_score -- confirm and replace with a
# clean stop condition if so.
run = 0
while True:
    run += 1
    # The solver expects states shaped (1, n_features).
    state = np.reshape(_reset_env(env), [1, observation_space])
    step = 0
    while True:
        step += 1
        # env.render()
        action = dqn_solver.act(state)
        state_next, reward, terminal, info = _step_env(env, action)
        # The step that ends the episode is stored with a negated reward.
        reward = reward if not terminal else -reward
        state_next = np.reshape(state_next, [1, observation_space])
        dqn_solver.remember(state, action, reward, state_next, terminal)
        state = state_next
        if terminal:
            print("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
            score_logger.add_score(step, run)
            break
    # Refit the Q-model once per finished episode.
    dqn_solver.experience_replay()

Run: 1, exploration: 1.0, score: 15
Scores: (min: 15, avg: 15, max: 15)

Run: 2, exploration: 1.0, score: 13


  z = np.polyfit(np.array(trend_x), np.array(y[1:]), 1)


Scores: (min: 13, avg: 14, max: 15)

Run: 3, exploration: 0.96, score: 15
Scores: (min: 13, avg: 14.333333333333334, max: 15)

Run: 4, exploration: 0.9216, score: 13
Scores: (min: 13, avg: 14, max: 15)

Run: 5, exploration: 0.884736, score: 18
Scores: (min: 13, avg: 14.8, max: 18)

Run: 6, exploration: 0.84934656, score: 9
Scores: (min: 9, avg: 13.833333333333334, max: 18)

Run: 7, exploration: 0.8153726976, score: 15
Scores: (min: 9, avg: 14, max: 18)

Run: 8, exploration: 0.782757789696, score: 15
Scores: (min: 9, avg: 14.125, max: 18)

Run: 9, exploration: 0.7514474781081599, score: 11
Scores: (min: 9, avg: 13.777777777777779, max: 18)

Run: 10, exploration: 0.7213895789838335, score: 19
Scores: (min: 9, avg: 14.3, max: 19)

Run: 11, exploration: 0.6925339958244802, score: 20
Scores: (min: 9, avg: 14.818181818181818, max: 20)

Run: 12, exploration: 0.6648326359915009, score: 16
Scores: (min: 9, avg: 14.916666666666666, max: 20)

Run: 13, exploration: 0.6382393305518408, score: 16
Sc

Run: 86, exploration: 0.05, score: 117
Scores: (min: 9, avg: 72.18604651162791, max: 190)

Run: 87, exploration: 0.05, score: 116
Scores: (min: 9, avg: 72.6896551724138, max: 190)

Run: 88, exploration: 0.05, score: 109
Scores: (min: 9, avg: 73.10227272727273, max: 190)

Run: 89, exploration: 0.05, score: 120
Scores: (min: 9, avg: 73.62921348314607, max: 190)

Run: 90, exploration: 0.05, score: 109
Scores: (min: 9, avg: 74.02222222222223, max: 190)

Run: 91, exploration: 0.05, score: 114
Scores: (min: 9, avg: 74.46153846153847, max: 190)

Run: 92, exploration: 0.05, score: 111
Scores: (min: 9, avg: 74.8586956521739, max: 190)

Run: 93, exploration: 0.05, score: 106
Scores: (min: 9, avg: 75.19354838709677, max: 190)

Run: 94, exploration: 0.05, score: 103
Scores: (min: 9, avg: 75.48936170212765, max: 190)

Run: 95, exploration: 0.05, score: 107
Scores: (min: 9, avg: 75.82105263157895, max: 190)

Run: 96, exploration: 0.05, score: 106
Scores: (min: 9, avg: 76.13541666666667, max: 190)

R

Scores: (min: 51, avg: 117.73, max: 175)

Run: 186, exploration: 0.05, score: 120
Scores: (min: 51, avg: 117.76, max: 175)

Run: 187, exploration: 0.05, score: 130
Scores: (min: 51, avg: 117.9, max: 175)

Run: 188, exploration: 0.05, score: 109
Scores: (min: 51, avg: 117.9, max: 175)

Run: 189, exploration: 0.05, score: 116
Scores: (min: 51, avg: 117.86, max: 175)

Run: 190, exploration: 0.05, score: 182
Scores: (min: 51, avg: 118.59, max: 182)

Run: 191, exploration: 0.05, score: 102
Scores: (min: 51, avg: 118.47, max: 182)

Run: 192, exploration: 0.05, score: 123
Scores: (min: 51, avg: 118.59, max: 182)

Run: 193, exploration: 0.05, score: 125
Scores: (min: 51, avg: 118.78, max: 182)

Run: 194, exploration: 0.05, score: 119
Scores: (min: 51, avg: 118.94, max: 182)

Run: 195, exploration: 0.05, score: 105
Scores: (min: 51, avg: 118.92, max: 182)

Run: 196, exploration: 0.05, score: 116
Scores: (min: 51, avg: 119.02, max: 182)

Run: 197, exploration: 0.05, score: 138
Scores: (min: 51, 

Run: 286, exploration: 0.05, score: 122
Scores: (min: 63, avg: 130.83, max: 246)

Run: 287, exploration: 0.05, score: 125
Scores: (min: 63, avg: 130.78, max: 246)

Run: 288, exploration: 0.05, score: 129
Scores: (min: 63, avg: 130.98, max: 246)

Run: 289, exploration: 0.05, score: 152
Scores: (min: 63, avg: 131.34, max: 246)

Run: 290, exploration: 0.05, score: 158
Scores: (min: 63, avg: 131.1, max: 246)

Run: 291, exploration: 0.05, score: 100
Scores: (min: 63, avg: 131.08, max: 246)

Run: 292, exploration: 0.05, score: 137
Scores: (min: 63, avg: 131.22, max: 246)

Run: 293, exploration: 0.05, score: 131
Scores: (min: 63, avg: 131.28, max: 246)

Run: 294, exploration: 0.05, score: 157
Scores: (min: 63, avg: 131.66, max: 246)

Run: 295, exploration: 0.05, score: 206
Scores: (min: 63, avg: 132.67, max: 246)

Run: 296, exploration: 0.05, score: 124
Scores: (min: 63, avg: 132.75, max: 246)

Run: 297, exploration: 0.05, score: 131
Scores: (min: 63, avg: 132.68, max: 246)

Run: 298, explora

Scores: (min: 60, avg: 144.76, max: 336)

Run: 387, exploration: 0.05, score: 141
Scores: (min: 60, avg: 144.92, max: 336)

Run: 388, exploration: 0.05, score: 160
Scores: (min: 60, avg: 145.23, max: 336)

Run: 389, exploration: 0.05, score: 181
Scores: (min: 60, avg: 145.52, max: 336)

Run: 390, exploration: 0.05, score: 153
Scores: (min: 60, avg: 145.47, max: 336)

Run: 391, exploration: 0.05, score: 191
Scores: (min: 60, avg: 146.38, max: 336)

Run: 392, exploration: 0.05, score: 194
Scores: (min: 60, avg: 146.95, max: 336)

Run: 393, exploration: 0.05, score: 207
Scores: (min: 60, avg: 147.71, max: 336)

Run: 394, exploration: 0.05, score: 127
Scores: (min: 60, avg: 147.41, max: 336)

Run: 395, exploration: 0.05, score: 229
Scores: (min: 60, avg: 147.64, max: 336)

Run: 396, exploration: 0.05, score: 119
Scores: (min: 60, avg: 147.59, max: 336)

Run: 397, exploration: 0.05, score: 123
Scores: (min: 60, avg: 147.51, max: 336)

Run: 398, exploration: 0.05, score: 54
Scores: (min: 54,

Run: 487, exploration: 0.05, score: 179
Scores: (min: 33, avg: 191.68, max: 500)

Run: 488, exploration: 0.05, score: 216
Scores: (min: 33, avg: 192.24, max: 500)

Run: 489, exploration: 0.05, score: 172
Scores: (min: 33, avg: 192.15, max: 500)

Run: 490, exploration: 0.05, score: 157
Scores: (min: 33, avg: 192.19, max: 500)

Run: 491, exploration: 0.05, score: 213
Scores: (min: 33, avg: 192.41, max: 500)

Run: 492, exploration: 0.05, score: 175
Scores: (min: 33, avg: 192.22, max: 500)

Run: 493, exploration: 0.05, score: 192
Scores: (min: 33, avg: 192.07, max: 500)

Run: 494, exploration: 0.05, score: 188
Scores: (min: 33, avg: 192.68, max: 500)

Run: 495, exploration: 0.05, score: 239
Scores: (min: 33, avg: 192.78, max: 500)

Run: 496, exploration: 0.05, score: 176
Scores: (min: 33, avg: 193.35, max: 500)

Run: 497, exploration: 0.05, score: 182
Scores: (min: 33, avg: 193.94, max: 500)

Run: 498, exploration: 0.05, score: 185
Scores: (min: 33, avg: 195.25, max: 500)

Solved in 398 ru

NameError: name 'exit' is not defined