In [9]:
import numpy as np
import random
from PIL import Image
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Dropout, Flatten
from keras.optimizers import Adam, Nadam, Adamax
sizes = (80,80,1)

class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=50000)
        self.gamma = 0.999   # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.00  # exploration will not decay futher
        self.epsilon_decay = 0.000995
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.weight_backup = 'model_weights.h5'

    def _build_model(self):
        model = Sequential()
        model.add(Conv2D(32, kernel_size=8, subsample=(4, 4), activation='relu', padding='same', input_shape= sizes))#80*80*4
        model.add(Conv2D(64, kernel_size=4, subsample=(2, 2), activation='relu', padding='same'))
        model.add(Conv2D(64, kernel_size=3, subsample=(1, 1), activation='relu', padding='same'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(self.action_size))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    '''
    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(20, input_dim=self.state_size, activation='elu'))
        #model.add(Dropout(0.1))
        model.add(Dense(20, activation='elu'))
        #model.add(Dropout(0.2))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Nadam(lr=self.learning_rate))
        return model
    '''
    
    def save_model(self):
            self.model.save(self.weight_backup)

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def remember(self, state, action, reward, new_state, done, num_actions):
        self.memory.append((state, action, reward, new_state, done, num_actions))
        if self.memory[-1][2] == 1.0:
            renumber = -self.memory[-1][5]
            for w in self.memory[renumber]:
                print(w[2])
                w[2] = 1.0
        elif self.memory[-1][2] == -1.0:
            renumber = -self.memory[-1][5]
            for w in self.memory[renumber]:
                w[2] = 1.0

    def memory_replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        Sample = random.sample(self.memory, batch_size)
        winsample = [s for s in self.memory if s[2]== 1.0]
        #print(winsample)
        tuple(winsample)
        if len(winsample) > batch_size:
            Samplewin = random.sample(winsample, batch_size)
            Sample += Samplewin
        else:  
            Sample += winsample
        for state, action, reward, new_state, done in Sample:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.model.predict(new_state))
            target_f = self.model.predict(state)
            print(action, reward, target_f)
            target_f[0] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
    '''        
    def RGBprocess(new_state):
        processed_observation = Image.fromarray(new_state, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((80, 80))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(1, processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
        return processed_observation
        #stack4.append(processed_observation)
        #if len(stack4) == 4:
            #stack_of_observation = np.stack((processed_observation, processed_observation, processed_observation, processed_observation), axis=2)
            #stack_of_observation = stack_of_observation.reshape(stack_of_observation.shape[0], stack_of_observation.shape[1], stack_of_observation.shape[3], stack_of_observation.shape[2])
            #print(stack_of_observation.shape)
    '''

In [10]:
import sys
import gym
from gym import wrappers

def RGBprocess(raw_img):
        processed_observation = Image.fromarray(raw_img, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((80, 80))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(1, processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
        return processed_observation

batch_size = 16
#episodes = sys.argv[1] if len(sys.argv) > 1 else 5000
#env_name = sys.argv[2] if len(sys.argv) > 2 else "Pong-v0"

episodes = 1500
env_name = "Pong-v0"

env = gym.make(env_name)

env = wrappers.Monitor(env, env_name, force=True)

agent = Agent(env.observation_space.shape, env.action_space.n)

for i_episodes in range(episodes):
    state = env.reset()
    totalreward = 0
    num_actions = 0
    done = False
    state = RGBprocess(state)
    while not done:
        #env.render()
        action = agent.act(state)
        new_state, reward, done, info = env.step(action)
        new_state = RGBprocess(new_state)
        if reward == 0:
            num_actions += 1
            agent.remember(state, action, reward, new_state, done, num_actions)
        else:
            num_actions += 1
            agent.remember(state, action, reward, new_state, done, num_actions)
            num_actions = 0
        state = new_state
        totalreward += reward
    agent.memory_replay(batch_size)
    if done:
        print("{} episode, score = {} ".format(i_episodes + 1, totalreward))
        agent.save_model()

env.close()
gym.upload(env_name, api_key='sk_WRCITkqmTJKYB9hvBk5tPA')

[2017-08-17 23:04:02,910] Making new env: Pong-v0
[2017-08-17 23:04:03,116] Finished writing results. You can upload them to the scoreboard via gym.upload('/home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0')
[2017-08-17 23:04:03,119] Clearing 4 monitor files from previous run (because force=True was provided)
[2017-08-17 23:04:03,213] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.4.10592.video000000.mp4


IndexError: index 2 is out of bounds for axis 0 with size 1

In [None]:
env.observation_space.shape

In [8]:
state.shape

(1, 80, 80, 1)

In [None]:
reward

In [3]:
test = agent.model.predict(state)

In [4]:
test[0]

array([ nan,  nan,  nan,  nan,  nan,  nan], dtype=float32)

In [None]:
env.reset()
env.close()

In [None]:
len(agent.memory)

In [None]:
processed_observation = Image.fromarray(state, 'RGB')
processed_observation = processed_observation.convert('L')
processed_observation = processed_observation.resize((80, 80))
processed_observation = np.array(processed_observation)
processed_observation = processed_observation.reshape(processed_observation.shape[0], processed_observation.shape[1], 1)

In [6]:
new_state

array([[[[ 87],
         [ 87],
         [ 87],
         ..., 
         [ 87],
         [ 87],
         [ 87]],

        [[ 87],
         [ 87],
         [ 87],
         ..., 
         [ 87],
         [ 87],
         [ 87]],

        [[ 87],
         [ 87],
         [ 87],
         ..., 
         [ 87],
         [ 87],
         [ 87]],

        ..., 
        [[236],
         [236],
         [236],
         ..., 
         [236],
         [236],
         [236]],

        [[236],
         [236],
         [236],
         ..., 
         [236],
         [236],
         [236]],

        [[236],
         [236],
         [236],
         ..., 
         [236],
         [236],
         [236]]]], dtype=uint8)

In [None]:
def RGBprocess(new_state):
        processed_observation = Image.fromarray(new_state, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((80, 80))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
        return processed_observation

In [5]:
RGBprocess(state)

ValueError: Too many dimensions: 4 > 3.

In [None]:
img.show()