In [1]:
import numpy as np
import random
from PIL import Image
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Dropout, Flatten
from keras.optimizers import Adam, Nadam, Adamax
sizes = (80,80,1)

class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95   # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.00  # exploration will not decay futher
        self.epsilon_decay = 0.000995
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.weight_backup = 'model_weights.h5'

    def _build_model(self):
        model = Sequential()
        model.add(Conv2D(32, kernel_size=8, subsample=(4, 4), activation='relu', padding='same', input_shape= sizes))#80*80*4
        model.add(Conv2D(64, kernel_size=4, subsample=(2, 2), activation='relu', padding='same'))
        model.add(Conv2D(64, kernel_size=3, subsample=(1, 1), activation='relu', padding='same'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(2, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    '''
    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(20, input_dim=self.state_size, activation='elu'))
        #model.add(Dropout(0.1))
        model.add(Dense(20, activation='elu'))
        #model.add(Dropout(0.2))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Nadam(lr=self.learning_rate))
        return model
    '''
    
    def save_model(self):
            self.model.save(self.weight_backup)

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def remember(self, state, action, reward, new_state, done):
        self.memory.append((state, action, reward, new_state, done))

    def memory_replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        Sample = random.sample(self.memory, batch_size)
        for state, action, reward, new_state, done in Sample:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.model.predict(new_state))
            target_f = self.model.predict(state)
            target_f[0] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
    '''        
    def RGBprocess(new_state):
        processed_observation = Image.fromarray(new_state, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((80, 80))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(1, processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
        return processed_observation
        #stack4.append(processed_observation)
        #if len(stack4) == 4:
            #stack_of_observation = np.stack((processed_observation, processed_observation, processed_observation, processed_observation), axis=2)
            #stack_of_observation = stack_of_observation.reshape(stack_of_observation.shape[0], stack_of_observation.shape[1], stack_of_observation.shape[3], stack_of_observation.shape[2])
            #print(stack_of_observation.shape)
    '''

Using TensorFlow backend.


In [2]:
import sys
import gym
from gym import wrappers

def RGBprocess(raw_img):
        processed_observation = Image.fromarray(raw_img, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((80, 80))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(1, processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
        return processed_observation

batch_size = 32
#episodes = sys.argv[1] if len(sys.argv) > 1 else 5000
#env_name = sys.argv[2] if len(sys.argv) > 2 else "Pong-v0"

episodes = 1500
env_name = "Pong-v0"

env = gym.make(env_name)

env = wrappers.Monitor(env, env_name, force=True)

agent = Agent(env.observation_space.shape, env.action_space.n)

for i_episodes in range(episodes):
    state = env.reset()
    index = 0
    done = False
    state = RGBprocess(state)
    while not done:
        env.render()
        action = agent.act(state)
        new_state, reward, done, info = env.step(action)
        new_state = RGBprocess(new_state)
        agent.remember(state, action, reward, new_state, done)
        state = new_state
        index += reward
    agent.memory_replay(batch_size)
    if done:
        print("{} episode, score = {} ".format(i_episodes + 1, index + 1))
        agent.save_model()

env.close()
gym.upload(env_name, api_key='sk_WRCITkqmTJKYB9hvBk5tPA')

[2017-08-16 22:10:38,055] Making new env: Pong-v0
[2017-08-16 22:10:38,239] Clearing 8 monitor files from previous run (because force=True was provided)
[2017-08-16 22:10:38,345] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.24715.video000000.mp4
[2017-08-16 22:11:02,878] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.24715.video000001.mp4


1 episode, score = -18.0 
2 episode, score = -16.0 
3 episode, score = -20.0 
4 episode, score = -20.0 
5 episode, score = -20.0 
6 episode, score = -20.0 
7 episode, score = -19.0 


[2017-08-16 22:13:36,128] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.24715.video000008.mp4


8 episode, score = -19.0 
9 episode, score = -20.0 
10 episode, score = -18.0 
11 episode, score = -18.0 
12 episode, score = -20.0 
13 episode, score = -19.0 
14 episode, score = -20.0 
15 episode, score = -19.0 
16 episode, score = -19.0 
17 episode, score = -20.0 
18 episode, score = -19.0 
19 episode, score = -18.0 
20 episode, score = -17.0 
21 episode, score = -19.0 
22 episode, score = -17.0 
23 episode, score = -19.0 
24 episode, score = -20.0 
25 episode, score = -19.0 
26 episode, score = -19.0 


[2017-08-16 22:20:56,732] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.24715.video000027.mp4


27 episode, score = -20.0 
28 episode, score = -19.0 
29 episode, score = -19.0 
30 episode, score = -20.0 
31 episode, score = -20.0 
32 episode, score = -20.0 
33 episode, score = -20.0 
34 episode, score = -19.0 
35 episode, score = -19.0 
36 episode, score = -20.0 
37 episode, score = -20.0 
38 episode, score = -17.0 
39 episode, score = -20.0 
40 episode, score = -20.0 
41 episode, score = -18.0 
42 episode, score = -20.0 
43 episode, score = -18.0 
44 episode, score = -20.0 
45 episode, score = -20.0 
46 episode, score = -19.0 
47 episode, score = -20.0 
48 episode, score = -20.0 
49 episode, score = -19.0 
50 episode, score = -20.0 
51 episode, score = -19.0 
52 episode, score = -20.0 
53 episode, score = -16.0 
54 episode, score = -17.0 
55 episode, score = -19.0 
56 episode, score = -19.0 
57 episode, score = -20.0 
58 episode, score = -19.0 
59 episode, score = -19.0 
60 episode, score = -20.0 
61 episode, score = -18.0 
62 episode, score = -20.0 
63 episode, score = -20.0 


[2017-08-16 22:34:23,238] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.24715.video000064.mp4


64 episode, score = -20.0 
65 episode, score = -18.0 
66 episode, score = -20.0 
67 episode, score = -20.0 
68 episode, score = -18.0 
69 episode, score = -20.0 
70 episode, score = -19.0 
71 episode, score = -19.0 
72 episode, score = -20.0 
73 episode, score = -17.0 
74 episode, score = -19.0 
75 episode, score = -20.0 
76 episode, score = -20.0 
77 episode, score = -20.0 
78 episode, score = -18.0 
79 episode, score = -20.0 
80 episode, score = -18.0 
81 episode, score = -19.0 
82 episode, score = -20.0 
83 episode, score = -19.0 
84 episode, score = -20.0 
85 episode, score = -19.0 
86 episode, score = -20.0 
87 episode, score = -20.0 
88 episode, score = -18.0 
89 episode, score = -20.0 
90 episode, score = -20.0 
91 episode, score = -19.0 
92 episode, score = -19.0 
93 episode, score = -19.0 
94 episode, score = -18.0 
95 episode, score = -19.0 
96 episode, score = -19.0 
97 episode, score = -20.0 
98 episode, score = -20.0 
99 episode, score = -18.0 
100 episode, score = -20.0 


[2017-08-17 06:57:55,951] Starting new video recorder writing to /home/z0m6ie/Documents/GitHub/Machine_Learning_Projects/deep-Q-learning/Test/Pong-v0/openaigym.video.0.24715.video000125.mp4


125 episode, score = -20.0 
126 episode, score = -20.0 
127 episode, score = -20.0 
128 episode, score = -19.0 
129 episode, score = -20.0 
130 episode, score = -20.0 
131 episode, score = -20.0 
132 episode, score = -19.0 
133 episode, score = -19.0 
134 episode, score = -18.0 
135 episode, score = -20.0 
136 episode, score = -18.0 
137 episode, score = -18.0 
138 episode, score = -17.0 
139 episode, score = -20.0 
140 episode, score = -19.0 
141 episode, score = -20.0 
142 episode, score = -20.0 
143 episode, score = -20.0 
144 episode, score = -20.0 
145 episode, score = -19.0 
146 episode, score = -19.0 
147 episode, score = -20.0 
148 episode, score = -20.0 
149 episode, score = -20.0 
150 episode, score = -19.0 
151 episode, score = -20.0 
152 episode, score = -20.0 
153 episode, score = -20.0 
154 episode, score = -19.0 
155 episode, score = -20.0 
156 episode, score = -18.0 
157 episode, score = -20.0 
158 episode, score = -20.0 
159 episode, score = -20.0 
160 episode, score =

KeyboardInterrupt: 

In [None]:
env.observation_space.shape

In [None]:
state.shape

In [None]:
info

In [None]:
test = agent.model.predict(state)

In [None]:
test[0]

In [None]:
env.reset()
env.close()

In [None]:
len(agent.memory)

In [None]:
processed_observation = Image.fromarray(state, 'RGB')
processed_observation = processed_observation.convert('L')
processed_observation = processed_observation.resize((80, 80))
processed_observation = np.array(processed_observation)
processed_observation = processed_observation.reshape(processed_observation.shape[0], processed_observation.shape[1], 1)

In [None]:
new_state

In [None]:
def RGBprocess(new_state):
        processed_observation = Image.fromarray(new_state, 'RGB')
        processed_observation = processed_observation.convert('L')
        processed_observation = processed_observation.resize((80, 80))
        processed_observation = np.array(processed_observation)
        processed_observation = processed_observation.reshape(processed_observation.shape[0], processed_observation.shape[1], 1) #1x80x80x1
        return processed_observation

In [None]:
RGBprocess(state)

In [None]:
img.show()