In [1]:
import numpy as np
import random
from PIL import Image
from collections import deque
from keras.models import Sequential
from keras import initializers
from keras.layers import Dense, Conv2D, Dropout, Flatten
from keras.optimizers import Adam, Nadam, Adamax
sizes = (84,84,4)

class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=50000)
        self.gamma = 0.99   # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.00  # exploration will not decay futher
        self.epsilon_decay = 0.000995
        self.learning_rate = 0.001
        self.loss = 0
        self.model = self._build_model()
        self.weight_backup = 'model_weights.h5'

    def _build_model(self):
        model = Sequential()
        model.add(Conv2D(32, kernel_size=8, subsample=(4, 4), activation='relu', kernel_initializer='random_uniform', bias_initializer='zeros', padding='same', input_shape= sizes))#80*80*4
        model.add(Conv2D(64, kernel_size=4, subsample=(2, 2), activation='relu', padding='same'))
        model.add(Conv2D(64, kernel_size=3, subsample=(1, 1), activation='relu', padding='same'))
        model.add(Flatten())
        model.add(Dense(256, activation='relu'))
        model.add(Dense(self.action_size))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    
    def save_model(self):
            self.model.save(self.weight_backup)

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            act_values = self.model.predict(state)
            #print(act_values)
            return np.argmax(act_values[0])

    def stack(self, processed_observation, new_game):
        global s_t, s_t1
        if new_game:
            s_t = np.stack((processed_observation, processed_observation, processed_observation, processed_observation), axis=2)
            s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])
            processed_stack = s_t
            return processed_stack
        else:
            x_t1 = processed_observation.reshape(1, processed_observation.shape[0], processed_observation.shape[1], 1)
            s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)
            processed_stack = s_t1
            return processed_stack

    def remember(self, state, action, reward, new_state, done):
        if len(self.memory) >= 50000:
            self.memory.popleft()
            self.memory.append([state, action, reward, new_state, done])
        else:
            self.memory.append([state, action, reward, new_state, done])    

    def memory_replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        Sample = random.sample(self.memory, batch_size)
        '''
        winsample = [s for s in self.memory if s[2]== 1.0]
        #print(winsample)
        tuple(winsample)
        if len(winsample) > 1:
            Samplewin = random.sample(winsample, 1)
            Sample += Samplewin
        else:  
            Sample += winsample
        '''
        inputs = np.zeros((len(Sample), Sample[0][0].shape[1], Sample[0][0].shape[2], Sample[0][0].shape[3])) # minibatch input
        targets = np.zeros((inputs.shape[0], self.action_size))
        
        for i in range(0, len(Sample)):
            sample_state = Sample[i][0]
            sample_action = Sample[i][1]
            sample_reward = Sample[i][2]
            sample_new_state = Sample[i][3]
            sample_done = Sample[i][4]
            
#             xxx = sample_new_state.reshape(sample_new_state.shape[1], sample_new_state.shape[2])
#             img = Image.fromarray(xxx, 'L')
#             img.show()
            
            inputs[i:i+1] = sample_state # slice of inputs setting = to state
            
            targets[i] = self.model.predict(sample_state)
            future_reward = self.model.predict(sample_new_state)
            
            if sample_done:
                targets[i, sample_action] = sample_reward
            #elif sample_reward == 1.0:
                #targets[i, sample_action] = sample_reward
            #elif sample_reward == -1.0:
                #targets[i, sample_action] = sample_reward
            else:
                targets[i, sample_action] = sample_reward + self.gamma * np.max(future_reward)
            #print(sample_action, sample_reward, targets[i, sample_action])
            #print(targets)
            #print(sample_action)
        self.loss += self.model.train_on_batch(inputs, targets)
        #print(self.loss)
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
    '''        
    def RGBprocess(new_state):
        processed_observation = Image.fromarray(new_state, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((80, 80))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(1, processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
        return processed_observation
        #stack4.append(processed_observation)
        #if len(stack4) == 4:
            #stack_of_observation = np.stack((processed_observation, processed_observation, processed_observation, processed_observation), axis=2)
            #stack_of_observation = stack_of_observation.reshape(stack_of_observation.shape[0], stack_of_observation.shape[1], stack_of_observation.shape[3], stack_of_observation.shape[2])
            #print(stack_of_observation.shape)
    '''

Using TensorFlow backend.


In [2]:
import sys
import gym
from gym import wrappers
from scipy import misc
import skimage as skimage
from skimage import color
from skimage import transform
from skimage import util
from skimage import exposure
#from skimage.viewer import ImageViewer
#import cv2

'''
def RGBprocess(raw_img):
    processed_observation = Image.fromarray(raw_img, 'RGB')
    processed_observation = processed_observation.convert('L')
    processed_observation = processed_observation.resize((84, 84))
    processed_observation = np.array(processed_observation)
    processed_observation = processed_observation.reshape(1, processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
    return processed_observation
'''
def RGBprocess(raw_img):
    x_t = skimage.color.rgb2gray(raw_img) # Grayscale
    x_t = skimage.transform.resize(x_t, (110,84), mode='constant', preserve_range=True) # Downsample
    processed_observation = skimage.util.crop(x_t,((19,7),(0,0))) # Crop
    return processed_observation
'''
def RGBprocess(raw_img): 
        I = raw_img[35:195]
        I = I[::2, ::2, 0]
        I[I == 144] = 0
        I[I == 109] = 0
        I[I != 0] = 1
        processed_observation = I.astype(np.float32)
        return processed_observation

def RGBprocess(raw_img):
    grayscale_observation = raw_img.mean(2)
    resized_observation = misc.imresize(grayscale_observation, (80, 80)).astype(np.float32)
    processed_observation = resized_observation.reshape(1, resized_observation.shape[0], resized_observation.shape[1], 1)
    return processed_observation


def RGBprocess(raw_img):
    frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    processed_observation = np.reshape(frame, (1, 84, 84, 1))
    return processed_observation
'''

batch_size = 32
#episodes = sys.argv[1] if len(sys.argv) > 1 else 5000
#env_name = sys.argv[2] if len(sys.argv) > 2 else "Pong-v0"

episodes = 1500
env_name = "Breakout-v0"
D = 84*84

env = gym.make(env_name)

env = wrappers.Monitor(env, env_name, force=True)

agent = Agent(env.observation_space.shape, env.action_space.n)

for i_episodes in range(episodes):
    State = env.reset()
    totalreward = 0
    new_game = True
    prev_x = None
    state = RGBprocess(State)
    state = agent.stack(state, new_game)
    short_mem = []
    done = False
    while not done:
        if i_episodes % 50 == 0:
            env.render()
        action = agent.act(state)
        new_state, reward, done, info = env.step(action)
        new_state = RGBprocess(new_state)
        #new_state_dif = new_state - prev_x if prev_x is not None else np.zeros((1, 84, 84, 1))
        #prev_x = new_state
        new_state_dif = agent.stack(new_state, new_game)
        agent.remember(state, action, reward, new_state_dif, done)
        state = new_state_dif
        totalreward += reward
        new_game = False
    agent.memory_replay(batch_size)
    if done:
        print("{} episode, score = {} ".format(i_episodes + 1, totalreward))
        agent.save_model()

env.close()
gym.upload(env_name, api_key='sk_WRCITkqmTJKYB9hvBk5tPA')

[2017-08-24 16:48:00,184] Making new env: Breakout-v0
[2017-08-24 16:48:00,442] Clearing 15 monitor files from previous run (because force=True was provided)
[2017-08-24 16:48:00,533] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Breakout-v0/openaigym.video.0.27707.video000000.mp4
[2017-08-24 16:48:06,289] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Breakout-v0/openaigym.video.0.27707.video000001.mp4


1 episode, score = 1.0 
2 episode, score = 6.0 
3 episode, score = 1.0 
4 episode, score = 0.0 
5 episode, score = 2.0 
6 episode, score = 3.0 
7 episode, score = 1.0 


[2017-08-24 16:48:14,783] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Breakout-v0/openaigym.video.0.27707.video000008.mp4


8 episode, score = 3.0 
9 episode, score = 1.0 
10 episode, score = 0.0 
11 episode, score = 4.0 
12 episode, score = 0.0 
13 episode, score = 0.0 
14 episode, score = 1.0 
15 episode, score = 3.0 
16 episode, score = 5.0 
17 episode, score = 3.0 
18 episode, score = 1.0 
19 episode, score = 1.0 
20 episode, score = 1.0 
21 episode, score = 2.0 
22 episode, score = 0.0 
23 episode, score = 2.0 
24 episode, score = 0.0 
25 episode, score = 3.0 
26 episode, score = 2.0 


[2017-08-24 16:48:35,395] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Breakout-v0/openaigym.video.0.27707.video000027.mp4


27 episode, score = 2.0 
28 episode, score = 2.0 
29 episode, score = 3.0 
30 episode, score = 0.0 
31 episode, score = 0.0 
32 episode, score = 1.0 
33 episode, score = 0.0 
34 episode, score = 1.0 
35 episode, score = 1.0 
36 episode, score = 2.0 
37 episode, score = 6.0 
38 episode, score = 0.0 
39 episode, score = 2.0 
40 episode, score = 2.0 
41 episode, score = 1.0 
42 episode, score = 1.0 
43 episode, score = 3.0 
44 episode, score = 1.0 
45 episode, score = 2.0 
46 episode, score = 2.0 
47 episode, score = 1.0 
48 episode, score = 1.0 
49 episode, score = 0.0 
50 episode, score = 1.0 
51 episode, score = 3.0 
52 episode, score = 2.0 
53 episode, score = 0.0 
54 episode, score = 1.0 
55 episode, score = 0.0 
56 episode, score = 2.0 
57 episode, score = 1.0 
58 episode, score = 3.0 
59 episode, score = 4.0 
60 episode, score = 1.0 
61 episode, score = 2.0 
62 episode, score = 1.0 
63 episode, score = 2.0 


[2017-08-24 16:49:20,843] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Breakout-v0/openaigym.video.0.27707.video000064.mp4


64 episode, score = 2.0 
65 episode, score = 0.0 
66 episode, score = 2.0 
67 episode, score = 3.0 
68 episode, score = 1.0 
69 episode, score = 2.0 
70 episode, score = 2.0 
71 episode, score = 0.0 
72 episode, score = 0.0 
73 episode, score = 0.0 
74 episode, score = 0.0 
75 episode, score = 0.0 
76 episode, score = 2.0 
77 episode, score = 2.0 
78 episode, score = 2.0 
79 episode, score = 4.0 
80 episode, score = 1.0 
81 episode, score = 0.0 
82 episode, score = 0.0 
83 episode, score = 4.0 
84 episode, score = 1.0 
85 episode, score = 0.0 
86 episode, score = 0.0 
87 episode, score = 3.0 
88 episode, score = 0.0 
89 episode, score = 2.0 
90 episode, score = 2.0 
91 episode, score = 0.0 
92 episode, score = 2.0 
93 episode, score = 4.0 
94 episode, score = 0.0 
95 episode, score = 5.0 
96 episode, score = 3.0 
97 episode, score = 2.0 
98 episode, score = 0.0 
99 episode, score = 1.0 
100 episode, score = 0.0 
101 episode, score = 1.0 
102 episode, score = 1.0 
103 episode, score = 2

[2017-08-24 16:50:48,636] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Breakout-v0/openaigym.video.0.27707.video000125.mp4


126 episode, score = 0.0 
127 episode, score = 1.0 
128 episode, score = 0.0 
129 episode, score = 0.0 
130 episode, score = 2.0 
131 episode, score = 0.0 
132 episode, score = 0.0 
133 episode, score = 2.0 
134 episode, score = 1.0 
135 episode, score = 1.0 
136 episode, score = 1.0 
137 episode, score = 3.0 
138 episode, score = 0.0 
139 episode, score = 2.0 
140 episode, score = 0.0 
141 episode, score = 1.0 
142 episode, score = 2.0 
143 episode, score = 0.0 
144 episode, score = 2.0 
145 episode, score = 2.0 
146 episode, score = 0.0 
147 episode, score = 1.0 
148 episode, score = 3.0 
149 episode, score = 2.0 
150 episode, score = 0.0 
151 episode, score = 3.0 
152 episode, score = 1.0 
153 episode, score = 0.0 
154 episode, score = 1.0 
155 episode, score = 0.0 
156 episode, score = 0.0 
157 episode, score = 2.0 
158 episode, score = 2.0 
159 episode, score = 2.0 
160 episode, score = 3.0 
161 episode, score = 3.0 
162 episode, score = 0.0 
163 episode, score = 0.0 
164 episode,

[2017-08-24 16:53:56,059] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Breakout-v0/openaigym.video.0.27707.video000216.mp4


216 episode, score = 2.0 


OSError: [Errno 12] Cannot allocate memory

In [None]:
import gym
from gym import wrappers
gym.upload(env_name, api_key='sk_WRCITkqmTJKYB9hvBk5tPA')

In [None]:
processed_observation.shape

In [None]:
reward

In [None]:
test = agent.model.predict(state)

In [None]:
test[0]

In [None]:
# env.reset()
# env.close()

In [None]:
len(agent.memory)

In [None]:
processed_observation = Image.fromarray(state, 'RGB')
processed_observation = processed_observation.convert('L')
processed_observation = processed_observation.resize((80, 80))
processed_observation = np.array(processed_observation)
processed_observation = processed_observation.reshape(processed_observation.shape[0], processed_observation.shape[1], 1)

In [None]:
new_state

In [None]:
def RGBprocess(new_state):
        processed_observation = Image.fromarray(new_state, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((80, 80))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
        return processed_observation

In [None]:
processed_stack

In [None]:
new_state_dif = new_state_dif.reshape(new_state_dif.shape[1], new_state_dif.shape[2])
img = Image.fromarray(new_state_dif, 'L')

In [None]:
#img = Image.fromarray(state, 'L')
img.show()

In [None]:
thing = [1,2,3,4,5,6]
for x in thing[-2:]:
    print(x)

In [None]:
        # way to adjust reward, think incorrect
        if reward == 0:
            num_actions += 1
            short_mem.append([state, action, reward, new_state_dif, done])
        else:
            num_actions += 1
            short_mem.append([state, action, reward, new_state_dif, done])
            if reward == -1.0:
                for m in short_mem:
                    m[2] = -1.0
                    agent.remember(m[0], m[1], m[2], m[3], m[4])
                num_actions = 0
            elif reward == 1.0:
                for m in short_mem:
                    m[2] = 1.0
                    agent.remember(m[0], m[1], m[2], m[3], m[4])
            short_mem = []

In [None]:
        # early attempt at increasing samples with positive reward
        winsample = [s for s in self.memory if s[2]== 1.0]
        #print(winsample)
        tuple(winsample)
        if len(winsample) > 4:
            Samplewin = random.sample(winsample, 4)
            Sample += Samplewin
        else:  
            Sample += winsample