In [1]:
import numpy as np
import random
from PIL import Image
from collections import deque
from keras.models import Sequential
from keras import initializers
from keras.layers import Dense, Conv2D, Dropout, Flatten
from keras.optimizers import Adam, Nadam, Adamax
sizes = (80,80,1)

class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=50000)
        self.gamma = 0.999   # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01  # exploration will not decay futher
        self.epsilon_decay = 0.000995
        self.learning_rate = 0.0001
        self.loss = 0
        self.model = self._build_model()
        self.weight_backup = 'model_weights.h5'

    def _build_model(self):
        model = Sequential()
        model.add(Conv2D(32, kernel_size=8, subsample=(4, 4), activation='relu', kernel_initializer='random_uniform', bias_initializer='zeros', padding='same', input_shape= sizes))#80*80*4
        model.add(Conv2D(64, kernel_size=4, subsample=(2, 2), activation='relu', padding='same'))
        model.add(Conv2D(64, kernel_size=3, subsample=(1, 1), activation='relu', padding='same'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(self.action_size))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    
    def save_model(self):
            self.model.save(self.weight_backup)

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def remember(self, state, action, reward, new_state, done):
        if len(self.memory) >= 50000:
            self.memory.popleft()
            self.memory.append([state, action, reward, new_state, done])
        else:
            self.memory.append([state, action, reward, new_state, done])    

    def memory_replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        Sample = random.sample(self.memory, batch_size)
        
        inputs = np.zeros((batch_size, state.shape[1], state.shape[2], state.shape[3])) # minibatch input
        targets = np.zeros((inputs.shape[0], self.action_size))
        
        for i in range(0, len(Sample)):
            sample_state = Sample[i][0]
            sample_action = Sample[i][1]
            sample_reward = Sample[i][2]
            sample_new_state = Sample[i][3]
            sample_done = Sample[i][4]
            
            inputs[i:i+1] = sample_state # slice of inputs setting = to state
            
            targets[i] = self.model.predict(sample_state)
            future_reward = self.model.predict(sample_new_state)
            
            if done:
                targets[i, sample_action] = sample_reward
            else:
                targets[i, sample_action] = sample_reward + self.gamma * np.amax(future_reward)
            #print(sample_action, sample_reward, targets[i, sample_action])
        self.loss += self.model.train_on_batch(inputs, targets)
        print(self.loss)
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
    '''        
    def RGBprocess(new_state):
        processed_observation = Image.fromarray(new_state, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((80, 80))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(1, processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
        return processed_observation
        #stack4.append(processed_observation)
        #if len(stack4) == 4:
            #stack_of_observation = np.stack((processed_observation, processed_observation, processed_observation, processed_observation), axis=2)
            #stack_of_observation = stack_of_observation.reshape(stack_of_observation.shape[0], stack_of_observation.shape[1], stack_of_observation.shape[3], stack_of_observation.shape[2])
            #print(stack_of_observation.shape)
    '''

Using TensorFlow backend.


In [None]:
import sys
import gym
from gym import wrappers
from scipy import misc

'''
def RGBprocess(raw_img):
        processed_observation = Image.fromarray(raw_img, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((80, 80))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(1, processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
        return processed_observation
'''
def RGBprocess(raw_img):
    grayscale_observation = raw_img.mean(2)
    resized_observation = misc.imresize(grayscale_observation, (80, 80)).astype(np.float32)
    processed_observation = resized_observation.reshape(1, resized_observation.shape[0], resized_observation.shape[1], 1)
    return processed_observation


batch_size = 32
#episodes = sys.argv[1] if len(sys.argv) > 1 else 5000
#env_name = sys.argv[2] if len(sys.argv) > 2 else "Pong-v0"

episodes = 3000
env_name = "Pong-v0"
D = 80*80
prev_x = None

env = gym.make(env_name)

env = wrappers.Monitor(env, env_name, force=True)

agent = Agent(env.observation_space.shape, env.action_space.n)

for i_episodes in range(episodes):
    state = env.reset()
    totalreward = 0
    num_actions = 0
    short_mem = []
    done = False
    state = RGBprocess(state)
    while not done:
        #env.render()
        action = agent.act(state)
        new_state, reward, done, info = env.step(action)
        new_state = RGBprocess(new_state)
        new_state_dif = new_state - prev_x if prev_x is not None else new_state
        prev_x = new_state
        agent.remember(state, action, reward, new_state_dif, done)
        totalreward += reward
    agent.memory_replay(batch_size)
    state = new_state_dif
    if done:
        print("{} episode, score = {} ".format(i_episodes + 1, totalreward))
        agent.save_model()

env.close()
gym.upload(env_name, api_key='sk_WRCITkqmTJKYB9hvBk5tPA')

[2017-08-22 20:22:09,310] Making new env: Pong-v0
[2017-08-22 20:22:09,494] Clearing 8 monitor files from previous run (because force=True was provided)
[2017-08-22 20:22:09,576] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.31296.video000000.mp4
[2017-08-22 20:22:13,193] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.31296.video000001.mp4


29.5900745392
1 episode, score = -20.0 
46.3470096588
2 episode, score = -19.0 
67.1074466705
3 episode, score = -19.0 
81.0636081696
4 episode, score = -20.0 
85.7823677063
5 episode, score = -20.0 
87.3732366562
6 episode, score = -21.0 
91.1796820164
7 episode, score = -21.0 


[2017-08-22 20:22:32,030] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.31296.video000008.mp4


101.335907221
8 episode, score = -19.0 
107.225305319
9 episode, score = -21.0 
113.55968833
10 episode, score = -20.0 
117.468733549
11 episode, score = -20.0 
118.620779514
12 episode, score = -21.0 
119.140314221
13 episode, score = -21.0 
120.940241337
14 episode, score = -21.0 
122.373589039
15 episode, score = -21.0 
124.819231987
16 episode, score = -21.0 
127.498017311
17 episode, score = -20.0 
129.398235559
18 episode, score = -21.0 
130.782907009
19 episode, score = -20.0 
132.446739674
20 episode, score = -20.0 
133.159897685
21 episode, score = -21.0 
133.387081653
22 episode, score = -21.0 
133.521993235
23 episode, score = -21.0 
134.060242012
24 episode, score = -21.0 
134.431001797
25 episode, score = -20.0 
135.677554145
26 episode, score = -20.0 


[2017-08-22 20:23:16,400] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.31296.video000027.mp4


136.684209839
27 episode, score = -21.0 
137.512863651
28 episode, score = -21.0 
137.857785299
29 episode, score = -19.0 
137.988484576
30 episode, score = -20.0 
138.03554143
31 episode, score = -21.0 
138.080526672
32 episode, score = -20.0 
138.1531545
33 episode, score = -20.0 
138.401482262
34 episode, score = -19.0 
138.719810791
35 episode, score = -21.0 
139.013596006
36 episode, score = -21.0 
139.25133393
37 episode, score = -21.0 
139.422856696
38 episode, score = -20.0 
139.525238924
39 episode, score = -20.0 
139.561391242
40 episode, score = -21.0 
139.58637985
41 episode, score = -20.0 
139.599776351
42 episode, score = -21.0 
139.643788875
43 episode, score = -19.0 
139.706657738
44 episode, score = -20.0 
139.752507654
45 episode, score = -20.0 
139.814424255
46 episode, score = -21.0 
139.880531349
47 episode, score = -19.0 
139.910292574
48 episode, score = -18.0 
139.931503759
49 episode, score = -21.0 
139.964087323
50 episode, score = -20.0 
140.003978749
51 epis

[2017-08-22 20:24:45,955] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.31296.video000064.mp4


140.239837226
64 episode, score = -21.0 
140.254271724
65 episode, score = -20.0 
140.280072086
66 episode, score = -21.0 
140.291393061
67 episode, score = -20.0 
140.292914033
68 episode, score = -20.0 
140.293723033
69 episode, score = -21.0 
140.296579257
70 episode, score = -21.0 
140.301805101
71 episode, score = -21.0 
140.305266685
72 episode, score = -21.0 
140.315811352
73 episode, score = -21.0 
140.321911973
74 episode, score = -20.0 
140.332183973
75 episode, score = -20.0 
140.344028368
76 episode, score = -21.0 
140.348917074
77 episode, score = -20.0 
140.353970408
78 episode, score = -20.0 
140.360255174
79 episode, score = -20.0 
140.361422035
80 episode, score = -21.0 
140.369489948
81 episode, score = -19.0 
140.379082852
82 episode, score = -21.0 
140.382655709
83 episode, score = -21.0 
140.396535011
84 episode, score = -21.0 
140.397137369
85 episode, score = -21.0 
140.40316955
86 episode, score = -21.0 
140.4056625
87 episode, score = -21.0 
140.408542444
88 ep

[2017-08-22 20:27:13,898] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.31296.video000125.mp4


140.573260604
125 episode, score = -19.0 
140.57384285
126 episode, score = -21.0 
140.579279276
127 episode, score = -21.0 
140.58074973
128 episode, score = -21.0 
140.581850284
129 episode, score = -21.0 
140.582441267
130 episode, score = -21.0 
140.594107552
131 episode, score = -20.0 
140.594361377
132 episode, score = -21.0 
140.600441176
133 episode, score = -21.0 
140.600736587
134 episode, score = -21.0 
140.600973744
135 episode, score = -21.0 
140.60137534
136 episode, score = -21.0 
140.601724169
137 episode, score = -21.0 
140.601897095
138 episode, score = -21.0 
140.612751959
139 episode, score = -20.0 
140.612871628
140 episode, score = -21.0 
140.61322676
141 episode, score = -20.0 
140.614091011
142 episode, score = -21.0 
140.614306389
143 episode, score = -21.0 
140.629077413
144 episode, score = -21.0 
140.639661347
145 episode, score = -20.0 
140.640420226
146 episode, score = -19.0 
140.646162324
147 episode, score = -20.0 
140.646877135
148 episode, score = -21

[2017-08-22 20:31:20,294] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.31296.video000216.mp4


140.901975777
216 episode, score = -21.0 
140.90807564
217 episode, score = -21.0 
140.914262977
218 episode, score = -21.0 
140.920321093
219 episode, score = -21.0 
140.926252486
220 episode, score = -21.0 
140.927780568
221 episode, score = -21.0 
140.933181936
222 episode, score = -21.0 
140.939849399
223 episode, score = -21.0 
140.946284765
224 episode, score = -21.0 
140.947000527
225 episode, score = -21.0 
140.952979049
226 episode, score = -20.0 
140.972640526
227 episode, score = -21.0 
140.978140831
228 episode, score = -20.0 
140.979466601
229 episode, score = -21.0 
140.982281257
230 episode, score = -21.0 
140.990269565
231 episode, score = -21.0 
141.000983858
232 episode, score = -21.0 
141.001182577
233 episode, score = -20.0 
141.002129731
234 episode, score = -21.0 
141.00416095
235 episode, score = -20.0 
141.01378029
236 episode, score = -21.0 
141.014011146
237 episode, score = -21.0 
141.021172631
238 episode, score = -21.0 
141.025864075
239 episode, score = -2

[2017-08-22 20:37:19,002] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.31296.video000343.mp4


141.524601852
343 episode, score = -21.0 
141.529653287
344 episode, score = -21.0 
141.535340382
345 episode, score = -21.0 
141.537203339
346 episode, score = -21.0 
141.538511683
347 episode, score = -20.0 
141.544378365
348 episode, score = -21.0 
141.549416007
349 episode, score = -21.0 
141.551207854
350 episode, score = -21.0 
141.556156618
351 episode, score = -21.0 
141.558198373
352 episode, score = -21.0 
141.564998961
353 episode, score = -20.0 
141.566780187
354 episode, score = -21.0 
141.567789683
355 episode, score = -20.0 
141.568562852
356 episode, score = -21.0 
141.575064132
357 episode, score = -20.0 
141.577458783
358 episode, score = -21.0 
141.578590603
359 episode, score = -21.0 
141.579463523
360 episode, score = -21.0 
141.579921131
361 episode, score = -21.0 
141.591666258
362 episode, score = -20.0 
141.59418978
363 episode, score = -21.0 
141.597287015
364 episode, score = -21.0 
141.604619231
365 episode, score = -21.0 
141.612075503
366 episode, score = 

[2017-08-22 20:46:30,385] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.31296.video000512.mp4


142.311492588
512 episode, score = -21.0 
142.323664526
513 episode, score = -21.0 
142.330757732
514 episode, score = -20.0 
142.33289192
515 episode, score = -21.0 
142.339307874
516 episode, score = -21.0 
142.347067024
517 episode, score = -21.0 
142.347855119
518 episode, score = -20.0 
142.349786276
519 episode, score = -18.0 
142.353127099
520 episode, score = -21.0 
142.354752616
521 episode, score = -21.0 
142.355885133
522 episode, score = -20.0 
142.357133809
523 episode, score = -21.0 
142.363523074
524 episode, score = -21.0 
142.373286476
525 episode, score = -21.0 
142.374053966
526 episode, score = -21.0 
142.375172134
527 episode, score = -19.0 
142.376402114
528 episode, score = -21.0 
142.383047219
529 episode, score = -21.0 
142.390619871
530 episode, score = -21.0 
142.399992162
531 episode, score = -21.0 
142.405802694
532 episode, score = -21.0 
142.415709743
533 episode, score = -21.0 
142.433997909
534 episode, score = -21.0 
142.441366097
535 episode, score = 

143.759911777
708 episode, score = -19.0 
143.771579612
709 episode, score = -21.0 
143.774508159
710 episode, score = -20.0 
143.792519362
711 episode, score = -21.0 
143.795926854
712 episode, score = -21.0 
143.805123015
713 episode, score = -21.0 
143.809357672
714 episode, score = -19.0 
143.818540257
715 episode, score = -21.0 
143.821614874
716 episode, score = -20.0 
143.8307531
717 episode, score = -21.0 
143.837928168
718 episode, score = -21.0 
143.841366449
719 episode, score = -20.0 
143.843499121
720 episode, score = -21.0 
143.844183875
721 episode, score = -20.0 
143.855595953
722 episode, score = -19.0 
143.859079314
723 episode, score = -21.0 
143.873607477
724 episode, score = -21.0 
143.8757067
725 episode, score = -21.0 
143.882957891
726 episode, score = -21.0 
143.88618961
727 episode, score = -21.0 
143.887982598
728 episode, score = -21.0 


[2017-08-22 21:00:10,155] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.31296.video000729.mp4


143.891450069
729 episode, score = -21.0 
143.894030363
730 episode, score = -18.0 
143.895762981
731 episode, score = -21.0 
143.89808027
732 episode, score = -21.0 
143.913964548
733 episode, score = -21.0 
143.921596556
734 episode, score = -20.0 
143.923654323
735 episode, score = -21.0 
143.926347674
736 episode, score = -21.0 
143.929743934
737 episode, score = -21.0 
143.941655277
738 episode, score = -19.0 
143.943130874
739 episode, score = -21.0 
143.945666143
740 episode, score = -21.0 
143.948435949
741 episode, score = -21.0 
143.959494353
742 episode, score = -21.0 
143.970977564
743 episode, score = -21.0 
143.972985379
744 episode, score = -21.0 
143.983057769
745 episode, score = -21.0 
143.988364865
746 episode, score = -21.0 
143.99226985
747 episode, score = -20.0 
143.993157751
748 episode, score = -21.0 
143.994686519
749 episode, score = -21.0 
144.004439509
750 episode, score = -21.0 
144.017831509
751 episode, score = -21.0 
144.029030123
752 episode, score = -

145.471189529
925 episode, score = -21.0 
145.473002719
926 episode, score = -21.0 
145.475238049
927 episode, score = -21.0 
145.478059519
928 episode, score = -21.0 
145.480317518
929 episode, score = -21.0 
145.480896729
930 episode, score = -21.0 
145.491797265
931 episode, score = -21.0 
145.494443309
932 episode, score = -21.0 
145.497984644
933 episode, score = -21.0 
145.499830087
934 episode, score = -21.0 
145.517470562
935 episode, score = -21.0 
145.519676568
936 episode, score = -21.0 
145.530167965
937 episode, score = -21.0 
145.543101201
938 episode, score = -20.0 
145.555826111
939 episode, score = -21.0 
145.571461965
940 episode, score = -21.0 
145.575389127
941 episode, score = -21.0 
145.579308514
942 episode, score = -21.0 
145.582970433
943 episode, score = -20.0 
145.591754935
944 episode, score = -20.0 
145.599873737
945 episode, score = -21.0 
145.60747009
946 episode, score = -21.0 
145.621480009
947 episode, score = -20.0 
145.644419445
948 episode, score = 

[2017-08-22 21:19:37,425] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.31296.video001000.mp4


146.253819185
1000 episode, score = -21.0 
146.257999105
1001 episode, score = -21.0 
146.275203723
1002 episode, score = -21.0 
146.289087691
1003 episode, score = -21.0 
146.299595002
1004 episode, score = -21.0 
146.304765178
1005 episode, score = -21.0 
146.329428686
1006 episode, score = -21.0 
146.335913537
1007 episode, score = -21.0 
146.364399811
1008 episode, score = -21.0 
146.38752472
1009 episode, score = -21.0 
146.399746656
1010 episode, score = -21.0 
146.422694858
1011 episode, score = -21.0 
146.445935759
1012 episode, score = -21.0 
146.450769716
1013 episode, score = -21.0 
146.470146485
1014 episode, score = -21.0 
146.475583097
1015 episode, score = -21.0 
146.489080294
1016 episode, score = -21.0 
146.505494701
1017 episode, score = -16.0 
146.514141765
1018 episode, score = -21.0 
146.521535031
1019 episode, score = -21.0 
146.536858424
1020 episode, score = -21.0 
146.544894481
1021 episode, score = -21.0 
146.556330097
1022 episode, score = -21.0 
146.56176423

147.80745214
1191 episode, score = -21.0 
147.809245188
1192 episode, score = -21.0 
147.810329938
1193 episode, score = -20.0 
147.816622889
1194 episode, score = -21.0 
147.817728304
1195 episode, score = -21.0 
147.819126308
1196 episode, score = -21.0 
147.820085943
1197 episode, score = -21.0 
147.832930943
1198 episode, score = -21.0 
147.839366315
1199 episode, score = -21.0 
147.843824412
1200 episode, score = -21.0 
147.858977481
1201 episode, score = -21.0 
147.871998693
1202 episode, score = -21.0 
147.879232559
1203 episode, score = -21.0 
147.882364278
1204 episode, score = -21.0 
147.893491237
1205 episode, score = -21.0 
147.894877631
1206 episode, score = -21.0 
147.895242603
1207 episode, score = -21.0 
147.901059166
1208 episode, score = -21.0 
147.909463134
1209 episode, score = -21.0 
147.910317738
1210 episode, score = -21.0 
147.912417887
1211 episode, score = -21.0 
147.91975284
1212 episode, score = -21.0 
147.930479756
1213 episode, score = -21.0 
147.935391758

149.45003999
1383 episode, score = -21.0 
149.46433496
1384 episode, score = -21.0 
149.475841909
1385 episode, score = -21.0 
149.494842875
1386 episode, score = -21.0 
149.502376061
1387 episode, score = -21.0 
149.515043548
1388 episode, score = -21.0 
149.529769512
1389 episode, score = -21.0 
149.549813408
1390 episode, score = -21.0 
149.56014405
1391 episode, score = -21.0 
149.570805963
1392 episode, score = -21.0 
149.580026233
1393 episode, score = -21.0 
149.584862443
1394 episode, score = -21.0 
149.590016213
1395 episode, score = -21.0 
149.600326948
1396 episode, score = -21.0 
149.602300244
1397 episode, score = -21.0 
149.610705398
1398 episode, score = -21.0 
149.630845897
1399 episode, score = -21.0 
149.635369495
1400 episode, score = -21.0 
149.64617578
1401 episode, score = -21.0 
149.657180726
1402 episode, score = -21.0 
149.661503615
1403 episode, score = -21.0 
149.667450474
1404 episode, score = -21.0 
149.671019495
1405 episode, score = -21.0 
149.674555469
1

In [None]:
import gym
from gym import wrappers
gym.upload(env_name, api_key='sk_WRCITkqmTJKYB9hvBk5tPA')

In [None]:
processed_observation.shape

In [None]:
reward

In [None]:
test = agent.model.predict(state)

In [None]:
test[0]

In [None]:
env.reset()
env.close()

In [None]:
len(agent.memory)

In [None]:
processed_observation = Image.fromarray(state, 'RGB')
processed_observation = processed_observation.convert('L')
processed_observation = processed_observation.resize((80, 80))
processed_observation = np.array(processed_observation)
processed_observation = processed_observation.reshape(processed_observation.shape[0], processed_observation.shape[1], 1)

In [None]:
new_state

In [None]:
def RGBprocess(new_state):
        processed_observation = Image.fromarray(new_state, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((80, 80))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
        return processed_observation

In [None]:
new_state_dif = new_state_dif.reshape(new_state_dif.shape[1], new_state_dif.shape[2])
img = Image.fromarray(new_state_dif, 'L')

In [None]:
#img = Image.fromarray(state, 'L')
img.show()

In [None]:
thing = [1,2,3,4,5,6]
for x in thing[-2:]:
    print(x)

In [None]:
        # way to adjust reward, think incorrect
        if reward == 0:
            num_actions += 1
            short_mem.append([state, action, reward, new_state_dif, done])
        else:
            num_actions += 1
            short_mem.append([state, action, reward, new_state_dif, done])
            if reward == -1.0:
                for m in short_mem:
                    m[2] = -1.0
                    agent.remember(m[0], m[1], m[2], m[3], m[4])
                num_actions = 0
            elif reward == 1.0:
                for m in short_mem:
                    m[2] = 1.0
                    agent.remember(m[0], m[1], m[2], m[3], m[4])
            short_mem = []

In [None]:
        # early attempt at increasing samples with positive reward
        winsample = [s for s in self.memory if s[2]== 1.0]
        #print(winsample)
        tuple(winsample)
        if len(winsample) > 4:
            Samplewin = random.sample(winsample, 4)
            Sample += Samplewin
        else:  
            Sample += winsample