In [3]:
import numpy as np
import random
from PIL import Image
from collections import deque
from keras.models import Sequential
from keras import initializers
from keras.layers import Dense, Conv2D, Dropout, Flatten
from keras.optimizers import Adam, Nadam, Adamax
sizes = (84,84,4)

class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=200000)
        self.gamma = 0.99   # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01  # exploration will not decay futher
        self.epsilon_decay = 0.0000198
        self.learning_rate = 0.0001
        self.loss = 0
        self.model = self._build_model()
        self.weight_backup = 'model_weights.h5'
        self.old_I_2 = None
        self.old_I_3 = None
        self.old_I_4 = None
        self.old_I_1 = None

    def _build_model(self):
        model = Sequential()
        model.add(Conv2D(32, kernel_size=8, subsample=(4, 4), activation='relu', kernel_initializer='random_uniform', bias_initializer='zeros', padding='same', input_shape= sizes))#80*80*4
        model.add(Conv2D(64, kernel_size=4, subsample=(2, 2), activation='relu', padding='same'))
        model.add(Conv2D(64, kernel_size=3, subsample=(1, 1), activation='relu', padding='same'))
        model.add(Flatten())
        model.add(Dense(256, activation='relu'))
        model.add(Dense(self.action_size))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        print(model.summary())
        return model

    
    def save_model(self):
            self.model.save(self.weight_backup)

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])
    
    def stack(self, processed_observation):
        I_4 = self.old_I_3 if self.old_I_3 is not None else np.zeros((1, 84, 84))
        I_3 = self.old_I_2 if self.old_I_2 is not None else np.zeros((1, 84, 84))
        I_2 = self.old_I_1 if self.old_I_1 is not None else np.zeros((1, 84, 84))
        I_1 = processed_observation
        processed_stack = np.stack((I_4, I_3, I_2, I_1), axis = 3)
        #print(processed_stack.shape)
        self.old_I_4 = I_4
        self.old_I_3 = I_3
        self.old_I_2 = I_2
        self.old_I_1 = I_1
        return processed_stack

    def remember(self, state, action, reward, new_state, done):
        if len(self.memory) >= 200000:
            self.memory.popleft()
            self.memory.append([state, action, reward, new_state, done])
        else:
            self.memory.append([state, action, reward, new_state, done])    

    def memory_replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        Sample = random.sample(self.memory, batch_size)
        for state, action, reward, new_state, done in Sample:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.model.predict(new_state)[0])
            target_f = self.model.predict(state)
            target_f[0][action] = target
            print(action, reward, target_f[0][action])
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay    
        
        '''
        if len(self.memory) < batch_size:
            return
        Sample = random.sample(self.memory, batch_size)
        '''

        '''
        inputs = np.zeros((len(Sample), state.shape[1], state.shape[2], state.shape[3])) # minibatch input
        targets = np.zeros((inputs.shape[0], self.action_size))
        
        for i in range(0, len(Sample)):
            sample_state = Sample[i][0]
            sample_action = Sample[i][1]
            sample_reward = Sample[i][2]
            sample_new_state = Sample[i][3]
            sample_done = Sample[i][4]
            
#             xxx = sample_new_state.reshape(sample_new_state.shape[1], sample_new_state.shape[2])
#             img = Image.fromarray(xxx, 'L')
#             img.show()
            
            inputs[i:i+1] = sample_state # slice of inputs setting = to state
            
            targets[i] = self.model.predict(sample_state)
            future_reward = self.model.predict(sample_new_state)
            
            if sample_done:
                targets[i, sample_action] = sample_reward
            #elif sample_reward == 1.0:
                #targets[i, sample_action] = sample_reward
            #elif sample_reward == -1.0:
                #targets[i, sample_action] = sample_reward
            else:
                targets[i, sample_action] = sample_reward + self.gamma * np.max(future_reward)
            print(sample_action, sample_reward, targets[i, sample_action])
        self.loss += self.model.train_on_batch(inputs, targets)
        print(self.loss)
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
            '''
    '''        
    def RGBprocess(new_state):
        processed_observation = Image.fromarray(new_state, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((80, 80))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(1, processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
        return processed_observation
        #stack4.append(processed_observation)
        #if len(stack4) == 4:
            #stack_of_observation = np.stack((processed_observation, processed_observation, processed_observation, processed_observation), axis=2)
            #stack_of_observation = stack_of_observation.reshape(stack_of_observation.shape[0], stack_of_observation.shape[1], stack_of_observation.shape[3], stack_of_observation.shape[2])
            #print(stack_of_observation.shape)
    '''

In [4]:
import sys
import gym
from gym import wrappers
from scipy import misc
#import cv2


def RGBprocess(raw_img):
        processed_observation = Image.fromarray(raw_img, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((84, 84))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(1, processed_observation.shape[0], processed_observation.shape[1]) #1x80x80x1
        return processed_observation

'''
def RGBprocess(raw_img): 
        I = raw_img[35:195]
        I = I[::2, ::2, 0]
        I[I == 144] = 0
        I[I == 109] = 0
        I[I != 0] = 1
        processed_observation = I.astype(np.float32)
        return processed_observation

def RGBprocess(raw_img):
    grayscale_observation = raw_img.mean(2)
    resized_observation = misc.imresize(grayscale_observation, (80, 80)).astype(np.float32)
    processed_observation = resized_observation.reshape(1, resized_observation.shape[0], resized_observation.shape[1], 1)
    return processed_observation


def RGBprocess(raw_img):
    frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    processed_observation = np.reshape(frame, (1, 84, 84, 1))
    return processed_observation
'''

batch_size = 32
#episodes = sys.argv[1] if len(sys.argv) > 1 else 5000
#env_name = sys.argv[2] if len(sys.argv) > 2 else "Pong-v0"

episodes = 1
env_name = "Breakout-v0"
D = 80*80

env = gym.make(env_name)

env = wrappers.Monitor(env, env_name, force=True)

agent = Agent(env.observation_space.shape, env.action_space.n)

for i_episodes in range(episodes):
    State = env.reset()
    state = RGBprocess(State)
    state = agent.stack(state)
    totalreward = 0
    num_actions = 0
    prev_x = None
    short_mem = []
    done = False
    while not done:
        #if i_episodes % 50 == 0:
            #env.render()
        action = agent.act(state)
        new_state, reward, done, info = env.step(action)
#         xxx = new_state
#         img = Image.fromarray(xxx, 'RGB')
#         img.show()
        new_state = RGBprocess(new_state)
#         xxx = new_state.reshape(new_state.shape[1], new_state.shape[2])
#         img = Image.fromarray(xxx, 'L')
#         img.show()
        #new_state_dif = new_state - prev_x if prev_x is not None else np.zeros((1, 80, 80, 1))
        #prev_x = new_state
        new_state_dif = agent.stack(new_state)
        agent.remember(state, action, reward, new_state_dif, done)
        state = new_state_dif
        totalreward += reward
    agent.memory_replay(batch_size)
    if done:
        print("{} episode, score = {} ".format(i_episodes + 1, totalreward))
        agent.save_model()

env.close()
gym.upload(env_name, api_key='sk_WRCITkqmTJKYB9hvBk5tPA')

[2017-09-01 11:58:47,851] Making new env: Breakout-v0
[2017-09-01 11:58:48,032] Clearing 4 monitor files from previous run (because force=True was provided)
[2017-09-01 11:58:48,186] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Breakout-v0/openaigym.video.1.11001.video000000.mp4


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 21, 21, 32)        8224      
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 11, 11, 64)        32832     
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 11, 11, 64)        36928     
_________________________________________________________________
flatten_2 (Flatten)          (None, 7744)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               1982720   
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 1028      
Total params: 2,061,732
Trainable params: 2,061,732
Non-trainable params: 0
_________________________________________________________________


[2017-09-01 11:58:49,896] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Breakout-v0')
[2017-09-01 11:58:49,897] [Breakout-v0] Uploading 1 episodes of training data


3 0.0 222.87
3 0.0 241.704
3 0.0 261.945
1 episode, score = 0.0 


[2017-09-01 11:58:50,920] [Breakout-v0] Uploading videos of 1 training episodes (10736 bytes)
[2017-09-01 11:58:51,609] [Breakout-v0] Creating evaluation object from Breakout-v0 with learning curve and training video
[2017-09-01 11:58:52,100] 
****************************************************
You successfully uploaded your evaluation on Breakout-v0 to
OpenAI Gym! You can find it at:

    https://gym.openai.com/evaluations/eval_pwl4hEjnQS627VRWy8OMQ

****************************************************


In [None]:
import gym
from gym import wrappers
gym.upload(env_name, api_key='sk_WRCITkqmTJKYB9hvBk5tPA')

In [None]:
processed_observation.shape

In [None]:
agent.model.predict(sample_new_state)

In [None]:
agent.model.predict(state)

In [None]:
test[0]

In [None]:
env.reset()
env.close()

In [None]:
len(agent.memory)

In [None]:
processed_observation = Image.fromarray(state, 'RGB')
processed_observation = processed_observation.convert('L')
processed_observation = processed_observation.resize((80, 80))
processed_observation = np.array(processed_observation)
processed_observation = processed_observation.reshape(processed_observation.shape[0], processed_observation.shape[1], 1)

In [None]:
new_state

In [None]:
def RGBprocess(new_state):
        processed_observation = Image.fromarray(new_state, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((80, 80))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
        return processed_observation

In [None]:
new_state_dif.shape

In [None]:
new_state_dif = new_state_dif.reshape(new_state_dif.shape[1], new_state_dif.shape[2])
img = Image.fromarray(new_state_dif, 'L')

In [None]:
#img = Image.fromarray(state, 'L')
img.show()

In [None]:
thing = [1,2,3,4,5,6]
for x in thing[-2:]:
    print(x)

In [None]:
        # way to adjust reward, think incorrect
        if reward == 0:
            num_actions += 1
            short_mem.append([state, action, reward, new_state_dif, done])
        else:
            num_actions += 1
            short_mem.append([state, action, reward, new_state_dif, done])
            if reward == -1.0:
                for m in short_mem:
                    m[2] = -1.0
                    agent.remember(m[0], m[1], m[2], m[3], m[4])
                num_actions = 0
            elif reward == 1.0:
                for m in short_mem:
                    m[2] = 1.0
                    agent.remember(m[0], m[1], m[2], m[3], m[4])
            short_mem = []

In [None]:
        # early attempt at increasing samples with positive reward
        winsample = [s for s in self.memory if s[2]== 1.0]
        #print(winsample)
        tuple(winsample)
        if len(winsample) > 4:
            Samplewin = random.sample(winsample, 4)
            Sample += Samplewin
        else:  
            Sample += winsample