In [1]:
import numpy as np
import random
from PIL import Image
from collections import deque
from keras.models import Sequential
from keras import initializers
from keras.layers import Dense, Conv2D, Dropout, Flatten
from keras.optimizers import Adam, Nadam, Adamax
sizes = (80,80,1)

class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=50000)
        self.gamma = 0.99   # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01  # exploration will not decay futher
        self.epsilon_decay = 0.000995
        self.learning_rate = 0.0001
        self.loss = 0
        self.model = self._build_model()
        self.weight_backup = 'model_weights.h5'

    def _build_model(self):
        model = Sequential()
        model.add(Conv2D(32, kernel_size=8, subsample=(4, 4), activation='relu', kernel_initializer='random_uniform', bias_initializer='zeros', padding='same', input_shape= sizes))#80*80*4
        model.add(Conv2D(64, kernel_size=4, subsample=(2, 2), activation='relu', padding='same'))
        model.add(Conv2D(64, kernel_size=3, subsample=(1, 1), activation='relu', padding='same'))
        model.add(Flatten())
        model.add(Dense(256, activation='relu'))
        model.add(Dense(self.action_size))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    
    def save_model(self):
            self.model.save(self.weight_backup)

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def remember(self, state, action, reward, new_state, done):
        if len(self.memory) >= 50000:
            self.memory.popleft()
            self.memory.append([state, action, reward, new_state, done])
        else:
            self.memory.append([state, action, reward, new_state, done])    

    def memory_replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        Sample = random.sample(self.memory, batch_size)
        
        winsample = [s for s in self.memory if s[2]== 1.0]
        #print(winsample)
        tuple(winsample)
        if len(winsample) > 1:
            Samplewin = random.sample(winsample, 1)
            Sample += Samplewin
        else:  
            Sample += winsample
        
        inputs = np.zeros((len(Sample), state.shape[1], state.shape[2], state.shape[3])) # minibatch input
        targets = np.zeros((inputs.shape[0], self.action_size))
        
        for i in range(0, len(Sample)):
            sample_state = Sample[i][0]
            sample_action = Sample[i][1]
            sample_reward = Sample[i][2]
            sample_new_state = Sample[i][3]
            sample_done = Sample[i][4]
            
            #xxx = sample_new_state.reshape(sample_new_state.shape[1], sample_new_state.shape[2])
            #img = Image.fromarray(xxx, 'L')
            #img.show()
            
            inputs[i:i+1] = sample_state # slice of inputs setting = to state
            
            targets[i] = self.model.predict(sample_state)
            future_reward = self.model.predict(sample_new_state)
            
            if sample_done:
                targets[i, sample_action] = sample_reward
            elif sample_reward == 1.0:
                targets[i, sample_action] = sample_reward
            elif sample_reward == -1.0:
                targets[i, sample_action] = sample_reward
            else:
                targets[i, sample_action] = sample_reward + self.gamma * np.amax(future_reward)
            print(sample_action, sample_reward, targets[i, sample_action])
        self.loss += self.model.train_on_batch(inputs, targets)
        print(self.loss)
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
    '''        
    def RGBprocess(new_state):
        processed_observation = Image.fromarray(new_state, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((80, 80))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(1, processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
        return processed_observation
        #stack4.append(processed_observation)
        #if len(stack4) == 4:
            #stack_of_observation = np.stack((processed_observation, processed_observation, processed_observation, processed_observation), axis=2)
            #stack_of_observation = stack_of_observation.reshape(stack_of_observation.shape[0], stack_of_observation.shape[1], stack_of_observation.shape[3], stack_of_observation.shape[2])
            #print(stack_of_observation.shape)
    '''

Using TensorFlow backend.


In [2]:
import sys
import gym
from gym import wrappers
from scipy import misc
#import cv2


def RGBprocess(raw_img):
        processed_observation = Image.fromarray(raw_img, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((80, 80))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(1, processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
        return processed_observation

'''
def RGBprocess(raw_img): 
        I = raw_img[35:195]
        I = I[::2, ::2, 0]
        I[I == 144] = 0
        I[I == 109] = 0
        I[I != 0] = 1
        processed_observation = I.astype(np.float32)
        return processed_observation

def RGBprocess(raw_img):
    grayscale_observation = raw_img.mean(2)
    resized_observation = misc.imresize(grayscale_observation, (80, 80)).astype(np.float32)
    processed_observation = resized_observation.reshape(1, resized_observation.shape[0], resized_observation.shape[1], 1)
    return processed_observation


def RGBprocess(raw_img):
    frame = cv2.cvtColor(cv2.resize(frame, (84, 84)), cv2.COLOR_BGR2GRAY)
    ret, frame = cv2.threshold(frame, 1, 255, cv2.THRESH_BINARY)
    processed_observation = np.reshape(frame, (1, 84, 84, 1))
    return processed_observation
'''

batch_size = 32
#episodes = sys.argv[1] if len(sys.argv) > 1 else 5000
#env_name = sys.argv[2] if len(sys.argv) > 2 else "Pong-v0"

episodes = 1300
env_name = "Pong-v0"
D = 80*80

env = gym.make(env_name)

env = wrappers.Monitor(env, env_name, force=True)

agent = Agent(env.observation_space.shape, env.action_space.n)

for i_episodes in range(episodes):
    State = env.reset()
    state = RGBprocess(State)
    totalreward = 0
    num_actions = 0
    prev_x = None
    short_mem = []
    done = False
    while not done:
        if i_episodes % 10 == 0:
            env.render()
        action = agent.act(state)
        new_state, reward, done, info = env.step(action)
        new_state = RGBprocess(new_state)
        new_state_dif = new_state - prev_x if prev_x is not None else np.zeros(D)
        prev_x = new_state
        agent.remember(state, action, reward, new_state_dif, done)
        state = new_state_dif
        totalreward += reward
    agent.memory_replay(batch_size)
    if done:
        print("{} episode, score = {} ".format(i_episodes + 1, totalreward))
        agent.save_model()

env.close()
gym.upload(env_name, api_key='sk_WRCITkqmTJKYB9hvBk5tPA')

[2017-08-23 16:53:19,460] Making new env: Pong-v0
[2017-08-23 16:53:19,753] Clearing 4 monitor files from previous run (because force=True was provided)
[2017-08-23 16:53:19,880] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.15964.video000000.mp4


2 0.0 0.519595491886
0 0.0 0.865375020504
3 0.0 0.750805047154
4 0.0 0.814879238605
0 0.0 0.0953205277026
0 0.0 0.279882856607
5 0.0 1.04835041642
4 0.0 0.283437741101
0 0.0 0.153322188556
2 -1.0 -1.0
0 0.0 1.0461328733
4 0.0 0.951453931332
4 0.0 0.494135966599
0 0.0 0.501550898552
1 0.0 0.650113070011
0 0.0 1.55178675771
3 0.0 0.456367218196
0 0.0 0.953810085654
0 0.0 0.75831099987
2 0.0 0.811630520225
2 0.0 1.29939435482
5 0.0 0.491780225337
3 0.0 0.550231752992
1 0.0 0.45570340097
5 0.0 1.68978343964
1 0.0 2.07471824169
0 0.0 0.69590893507
0 0.0 0.890539886355
4 0.0 0.705812702179
3 0.0 1.1998637259
1 0.0 1.03742497444
5 0.0 0.580094175339


[2017-08-23 16:53:28,385] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.15964.video000001.mp4


0.180232271552
1 episode, score = -21.0 
2 0.0 1.10895850182
3 0.0 1.26953269958
5 0.0 1.23870070696
5 0.0 0.932734869719
5 0.0 1.38919611812
2 0.0 0.466427535117
5 0.0 0.715940642953
4 0.0 0.617840972543
0 0.0 1.43064151525
4 0.0 1.10404379368
3 0.0 0.592653683424
1 0.0 0.737578977942
2 0.0 0.61380254209
2 0.0 0.988185308576
1 0.0 1.36153300524
1 0.0 0.515487785339
4 0.0 0.773358723521
2 0.0 1.36552293062
2 0.0 0.609759036899
1 0.0 1.47447168589
1 0.0 1.15361703515
2 0.0 1.78625211239
0 0.0 1.3562572825
0 0.0 0.57189498961
2 0.0 0.850919979215
3 0.0 1.30622743249
3 0.0 1.24606415391
5 0.0 0.796367769241
3 0.0 0.15927006349
0 0.0 0.56271997869
1 0.0 0.62621281743
4 0.0 1.14605649948
0.289397053421
2 episode, score = -21.0 
3 0.0 0.00172587887384
0 0.0 0.788363547921
2 0.0 0.637856217027
2 0.0 0.910382471681
5 0.0 0.813238504529
0 0.0 0.916659098268
2 0.0 1.57575699449
4 0.0 0.499851509929
4 0.0 1.01028113723
0 0.0 1.37849785924
0 0.0 0.769459789395
5 -1.0 -1.0
4 0.0 1.66557823062
1 0.0

[2017-08-23 16:53:40,839] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.15964.video000008.mp4


0.98898678273
8 episode, score = -21.0 
5 0.0 2.65270297766
0 0.0 2.06463296413
4 0.0 2.06066144943
5 0.0 0.0100579759385
2 0.0 0.926807810068
4 0.0 2.10572678804
5 0.0 1.93523041248
2 0.0 1.24377580047
1 0.0 2.8750058341
4 0.0 1.31719571471
4 0.0 2.96732856274
2 0.0 2.50178093433
3 0.0 2.76514150143
5 0.0 2.62265060663
0 0.0 0.247455492765
1 0.0 1.47744666338
3 0.0 3.01777572155
1 0.0 2.960707798
4 0.0 1.87163259745
2 -1.0 -1.0
2 0.0 2.0636609745
2 0.0 2.03334872961
4 0.0 1.24377580047
4 0.0 2.69857460976
4 0.0 1.94754704118
1 0.0 1.48787183046
5 0.0 1.82452733159
3 0.0 2.14753367186
0 0.0 2.81785907507
4 0.0 2.74982995033
1 0.0 2.81757795811
3 0.0 1.63673529983
1 1.0 1.0
1.17196273059
9 episode, score = -21.0 
3 -1.0 -1.0
3 0.0 1.06484379172
3 0.0 1.24645361066
0 0.0 1.06285720825
3 0.0 2.78553935766
4 0.0 1.23291101933
1 0.0 2.09609516859
0 0.0 1.29693971515
0 0.0 1.90381034017
0 0.0 2.4184758997
1 0.0 1.86763771534
4 0.0 1.3717240262
2 0.0 3.45320016861
1 0.0 4.08509067535
5 0.0 1.

ValueError: Error when checking : expected conv2d_1_input to have 4 dimensions, but got array with shape (6400, 1)

In [None]:
import gym
from gym import wrappers
gym.upload(env_name, api_key='sk_WRCITkqmTJKYB9hvBk5tPA')

In [None]:
processed_observation.shape

In [None]:
reward

In [None]:
test = agent.model.predict(state)

In [None]:
test[0]

In [None]:
env.reset()
env.close()

In [None]:
len(agent.memory)

In [None]:
processed_observation = Image.fromarray(state, 'RGB')
processed_observation = processed_observation.convert('L')
processed_observation = processed_observation.resize((80, 80))
processed_observation = np.array(processed_observation)
processed_observation = processed_observation.reshape(processed_observation.shape[0], processed_observation.shape[1], 1)

In [None]:
new_state

In [None]:
def RGBprocess(new_state):
        processed_observation = Image.fromarray(new_state, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((80, 80))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
        return processed_observation

In [None]:
new_state_dif.shape

In [None]:
new_state_dif = new_state_dif.reshape(new_state_dif.shape[0], new_state_dif.shape[1])
img = Image.fromarray(new_state_dif, 'L')

In [None]:
#img = Image.fromarray(state, 'L')
img.show()

In [None]:
thing = [1,2,3,4,5,6]
for x in thing[-2:]:
    print(x)

In [None]:
        # way to adjust reward, think incorrect
        if reward == 0:
            num_actions += 1
            short_mem.append([state, action, reward, new_state_dif, done])
        else:
            num_actions += 1
            short_mem.append([state, action, reward, new_state_dif, done])
            if reward == -1.0:
                for m in short_mem:
                    m[2] = -1.0
                    agent.remember(m[0], m[1], m[2], m[3], m[4])
                num_actions = 0
            elif reward == 1.0:
                for m in short_mem:
                    m[2] = 1.0
                    agent.remember(m[0], m[1], m[2], m[3], m[4])
            short_mem = []

In [None]:
        # early attempt at increasing samples with positive reward
        winsample = [s for s in self.memory if s[2]== 1.0]
        #print(winsample)
        tuple(winsample)
        if len(winsample) > 4:
            Samplewin = random.sample(winsample, 4)
            Sample += Samplewin
        else:  
            Sample += winsample