
CNN and dqn generation


https://www.intelnervana.com/demystifying-deep-reinforcement-learning/

In [136]:
from ple.games.flappybird import FlappyBird
from ple import PLE
import random
import numpy as np
import pickle
from classes import StackedImages, ImageProcessor
from IPython.display import clear_output


from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.optimizers import SGD
from keras.models import load_model


from collections import deque

import timeit

from skimage import io
import matplotlib.pyplot as plt

#down-sized dimensions of the state representations
input_num_rows = 96
input_num_cols = 114

data_input = '/users/momori/data/stacked_images.pkl'

## Image Processing with CNN (Convolutional Neural Network)

input shape is (4,288, 512)

In [4]:
# '''simple convolutional neural network'''
# model = Sequential()

# model.add(Conv2D(6, (8,8), activation='relu', input_shape=(288, 512, 4)))
# model.add(MaxPooling2D(pool_size=(4,4)))

# model.add(Flatten())
# model.add(Dense(10, activation='relu'))
# model.add(Dense(2, activation='softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='sgd')

In [144]:
def update_reward(value):
    if value==0.0:
        return 0.001
    elif value==5.0:
        return 20.0
    elif value==-5.0:
        return -5.0

def test_game(agent):
    #init the game flappybird
    game = FlappyBird()

    #set up the screen
    p = PLE(game, fps=30, display_screen=True)

    p.init()

    observations = deque()

    max_frames = 100 ##first frame is null
    num_frame_stacks = 4
    num_to_observe = 12 #observations before we train the network
    batch_count = 4 #train the model by selecting some subsample of the stored replays
                    #contains lists of replay envs. each element is in the format
                    #[state, action, reward, new_state, terminal(boolean)]
    epsilon = 0.8 #when to choose the known best action vs random action
    gamma = 0.8 #importance of future rewards compared to current reward
    
    total_reward = 0

    replay_data = deque() 

    game_over = False
    for i in range(0, max_frames):
        observation = p.getScreenRGB()
        if np.max(observation) != 0:
            observations.append(observation)


        ##after num_frame_stacks frames pass, we start predicting actions
        if len(observations) == num_frame_stacks:
            image_processors = [ImageProcessor(observations[j])
                               for j in range(num_frame_stacks)]
            stacked_images = StackedImages(image_processors, num_frame_stacks)
            action, output = agent.predict(stacked_images.get_stacked_images())
            reward = update_reward(p.act(action))
            total_reward += reward
            #print action, output

            #get new state and create the replay data
            observation = p.getScreenRGB()
#             replay_data.append([stacked_images, action, reward, 
#                                 ImageProcessor(observation),p.game_over()])

            #remove the oldest state so new state can be added in next iteration
            observations.popleft()

        else: ##first few frames, do nothing
            p.act(None)

        #if game over, reset the game and the observations queue
        if p.game_over():
            p.reset_game()
            observations = deque()
            break
    return total_reward

In [85]:
test_game(agent)

<__main__.PolicyAgent instance at 0x1c3333de60>
119 [[ 0.1393802  0.       ]]
119 [[ 0.13503227  0.        ]]
119 [[ 0.12506482  0.        ]]
119 [[ 0.14631693  0.        ]]
119 [[ 0.14587004  0.        ]]
119 [[ 0.13164666  0.        ]]
119 [[ 0.11812669  0.        ]]
119 [[ 0.12161004  0.        ]]
119 [[ 0.14044766  0.        ]]
119 [[ 0.12986581  0.        ]]
119 [[ 0.14485033  0.        ]]
119 [[ 0.14546552  0.        ]]
119 [[ 0.10938943  0.        ]]
119 [[ 0.1036599  0.       ]]
119 [[ 0.12555911  0.        ]]
119 [[ 0.12564251  0.        ]]
119 [[ 0.1291144  0.       ]]
119 [[ 0.14856569  0.        ]]
119 [[ 0.13094872  0.        ]]
119 [[ 0.14893465  0.        ]]
119 [[ 0.16122477  0.        ]]
119 [[ 0.15574256  0.        ]]
119 [[ 0.17841849  0.        ]]
119 [[ 0.18821411  0.        ]]
119 [[ 0.17314672  0.        ]]
119 [[ 0.19936012  0.        ]]
119 [[ 0.21840173  0.        ]]
119 [[ 0.20229319  0.        ]]
119 [[ 0.18935464  0.        ]]
119 [[ 0.18822902  0.        ]

52.0

In [140]:
class PolicyAgent():
    '''agent to learn and provide optimal actions during the game run.
    the output is the expected reward for taking that action in that state
    output:
        higher index 0 -> flap
        higher index 1 -> do nothing
        '''
    def __init__(self):
        ##cnn 
        model = Sequential()
        model.add(Conv2D(32, (8,8), activation='relu', input_shape=(input_num_rows, input_num_cols, 4)))
        model.add(MaxPooling2D(pool_size=(10,10)))
        model.add(Flatten())
        model.add(Dense(10, activation='relu'))
        #model.add(Dense(2, activation='softmax'))
        model.add(Dense(2, activation='relu'))
        #model.compile(loss='categorical_crossentropy', optimizer='sgd')
        model.compile(loss='mse', optimizer='sgd')

        self._model = model
    
    
    def fit(self, x_data, y_data, batch_size=32, epochs=1):
        x_data = x_data.reshape(1, x_data.shape[1], x_data.shape[2], x_data.shape[0])
        y_data = np.array(y_data).reshape(1,2)
        self._model.fit(x_data, y_data, batch_size, epochs, verbose=0)
        
    def predict(self, x_data):
        x_data = x_data.reshape(1, x_data.shape[1], x_data.shape[2], x_data.shape[0])
        output = self._model.predict(x_data)
        index = np.random.choice(np.flatnonzero(output==np.max(output)))
#         if output[0][0] > output[0][1]:
#             action = 119
#         else:
#             action = None
        if index==0:
            action = 119
        else:
            action = None
        return action, output
    
    def train_on_batch(self, x_data, y_data):
        #x_data = x_data.reshape(1, x_data.shape[1], x_data.shape[2], x_data.shape[0])
        self._model.train_on_batch(x_data, y_data)
        
    def summary(self):
        return self._model.summary()

In [141]:
def train_policy(agent, replay_data, num_frame_stacks=4, batch_count=0):
    '''use the replay data to train the network
    agent: PolicyAgent object
    replay_data: queue of lists
                 [0]: StackedImage object (current state)
                 [1]: Action
                 [2]: Reward
                 [3]: ImageProcessor object (next state)
                 [4]: is game terminal (boolean)
    num_frame_stack: number of frames to stack as one data sample
    batch_count: non-zero value indicates to use experience replay. 
                 # of frames to use from this replay_data to train the network
    '''
    #if batch_count is not 0, we use random sampling of the replay data
    if batch_count != 0:
        replay_data = random.sample(replay_data, batch_count)

    #inputs/targets for training the neural network
    #inputs = []
    #targets = []

    for i in range(0,len(replay_data),num_frame_stacks):
        curr_state = replay_data[i][0].get_stacked_images()

        #modified later to keep track of next stacked frames
        next_state = np.split(curr_state, num_frame_stacks, axis=0)

        action = replay_data[i][1]
        if action == 119:  #change to proper index of the action output
            action = 0
        else:
            action = 1
        reward = replay_data[i][2]
        next_frame = replay_data[i][3].gray_image\
            .reshape(1, input_num_rows, input_num_cols) #reshape to  stack properly onto the current state

        #append latest state and pop the earliest
        next_state.append(next_frame)
        next_state.pop(0)

        is_terminal = replay_data[i][4]

        #get the highest reward possible in the next state
        _, output = agent.predict(np.concatenate(next_state, axis=0))
        next_reward = np.max(output)

        #if terminal, no more rewards
        if is_terminal:
            total_reward = -5
        else:
            total_reward = reward + gamma * next_reward

        #inputs.append(curr_state)
        #targets.append(output[0])
        output[0][action] = total_reward

        agent.fit(curr_state, output[0])
        
    return agent



In [149]:
#%%time
output_file = 'logs/log'
#init the game flappybird
game = FlappyBird()

#set up the screen
p = PLE(game, fps=30, display_screen=True)

p.init()

agent = PolicyAgent()
observations = deque()

'''values to change in AWS env'''
max_frames = 105#640000 ##first frame is null
num_frame_stacks = 4
num_to_observe = 20#3200 #observations before we train the network


batch_count = int(num_to_observe * 0.2)  #train the model by selecting some subsample of the stored replays
                #contains lists of replay envs. each element is in the format
                #[state, action, reward, new_state, terminal(boolean)]
epsilon = 0.7 #choose best action 80% of the time
gamma = 0.8 #importance of future rewards compared to current reward
train_after_batch = True
count_game_over = 0
train_iteration_scores = []

replay_data = deque() 

game_over = False

#start time
start_time = timeit.default_timer()
for i in range(0, max_frames):
    if i % (max_frames/20) == 0:
        elapsed = timeit.default_timer() - start_time
        with open(output_file, 'a') as log_handle:
            log_handle.write('done with ' + str(i) + ' frames with ' + str(elapsed) + ' seconds\n')
        
        #reset timer
        start_time = timeit.default_timer()
    
    observation = p.getScreenRGB()
    if np.max(observation) != 0:
        observations.append(observation)
    
    
    ##after num_frame_stacks frames pass, we start predicting actions
    if len(observations) == num_frame_stacks:
        image_processors = [ImageProcessor(observations[j])
                           for j in range(num_frame_stacks)]
        stacked_images = StackedImages(image_processors, num_frame_stacks)
        if random.random() < epsilon:
            action, output = agent.predict(stacked_images.get_stacked_images())
        else:
            action = np.random.choice([119, None])

        ##update rewards    
        reward = update_reward(p.act(action))
        
                
        #get new state and create the replay data
        observation = p.getScreenRGB()
        replay_data.append([stacked_images, action, reward, 
                            ImageProcessor(observation),p.game_over()])
        
        #remove the oldest state
        observations.popleft()
        
    else: ##first few frames, do nothing
        p.act(None)
    
    #if game over, reset the game and the observations queue
    if p.game_over():
        count_game_over += 1
        p.reset_game()
        observations = deque()
        #print 'game reset'
        
    #once we have enough data, train on a subsample of it
    if len(replay_data) == num_to_observe:
        agent = train_policy(agent, replay_data, num_frame_stacks=num_frame_stacks, batch_count=batch_count)
        
        if train_after_batch:
            test_score = test_game(agent)
            train_iteration_scores.append([count_game_over, test_score]) 
            
        ##reset
        replay_data = deque()

with open(str(train_after_batch)+'_train_iteration_scores.pkl', 'w') as f:
    pickle.dump(train_iteration_scores,f)
print 'done'

done


In [122]:
test_game(agent)

<__main__.PolicyAgent instance at 0x1c7dfb4248>
119 [[ 1.49485052  0.        ]]
119 [[ 1.50090814  0.        ]]
119 [[ 1.50800264  0.        ]]
119 [[ 1.5117799  0.       ]]
119 [[ 1.50597656  0.        ]]
119 [[ 1.52500486  0.        ]]
119 [[ 1.51873565  0.        ]]
119 [[ 1.5251081  0.       ]]
119 [[ 1.55243731  0.        ]]
119 [[ 1.55760431  0.        ]]
119 [[ 1.56204641  0.        ]]
119 [[ 1.5768944  0.       ]]
119 [[ 1.58816552  0.        ]]
119 [[ 1.60141385  0.        ]]
119 [[ 1.62619567  0.        ]]
119 [[ 1.6374954  0.       ]]
119 [[ 1.64497173  0.        ]]
119 [[ 1.66156971  0.        ]]
119 [[ 1.65528584  0.        ]]
119 [[ 1.66183329  0.        ]]
119 [[ 1.67012346  0.        ]]
119 [[ 1.65949166  0.        ]]
119 [[ 1.66899228  0.        ]]
119 [[ 1.68996084  0.        ]]
119 [[ 1.68270075  0.        ]]
119 [[ 1.68256164  0.        ]]
119 [[ 1.6801883  0.       ]]
119 [[ 1.66394174  0.        ]]
119 [[ 1.66512096  0.        ]]
119 [[ 1.67218661  0.        ]]
11

52.0

In [132]:
import pickle
with open('train_iteration_scores.pkl', 'w') as f:
    pickle.dump(train_iteration_scores, f)

train_iteration_scores

[[0, 52.0], [0, 52.0], [1, 52.0], [1, 52.0]]

In [134]:
with open('train_iteration_scores.pkl', 'r') as f:
    test = pickle.load(f)

In [135]:
test

[[0, 52.0], [0, 52.0], [1, 52.0], [1, 52.0]]

In [12]:
##BACKUP

#%%time
#init the game flappybird
game = FlappyBird()

#set up the screen
p = PLE(game, fps=30, display_screen=True)

p.init()

agent = PolicyAgent()
observations = deque()

max_frames = 640000 ##first frame is null
num_frame_stacks = 4
num_to_observe = 3200 #observations before we train the network
batch_count = num_to_observe * 0.2  #train the model by selecting some subsample of the stored replays
                #contains lists of replay envs. each element is in the format
                #[state, action, reward, new_state, terminal(boolean)]
epsilon = 0.8 #when to choose the known best action vs random action
gamma = 0.8 #importance of future rewards compared to current reward

replay_data = deque() 

game_over = False
for i in range(0, max_frames):
    if i % 200 == 0:
        print i,
    observation = p.getScreenRGB()
    if np.max(observation) != 0:
        observations.append(observation)
    
    
    ##after num_frame_stacks frames pass, we start predicting actions
    if len(observations) == num_frame_stacks:
        image_processors = [ImageProcessor(observations[j])
                           for j in range(num_frame_stacks)]
        stacked_images = StackedImages(image_processors, num_frame_stacks)
        if random.random() < epsilon:
            action, output = agent.predict(stacked_images.get_stacked_images())
        else:
            action = np.random.choice([119, None])

        ##update rewards    
        reward = update_reward(p.act(action))
        
        
        #print action, output
        
        #get new state and create the replay data
        observation = p.getScreenRGB()
        replay_data.append([stacked_images, action, reward, 
                            ImageProcessor(observation),p.game_over()])
        
        #remove the oldest state
        observations.popleft()
        
    else: ##first few frames, do nothing
        p.act(None)
    
    #if game over, reset the game and the observations queue
    if p.game_over():
        p.reset_game()
        observations = deque()
        print 'game reset'
    
    #once we have enough data, train on a subsample of it
    if len(replay_data) == num_to_observe:
        #print 'fitting'
        replay_data = random.sample(replay_data, batch_count)
        
        #experience replay
        inputs = []
        targets = []
        for i in range(0,len(replay_data),num_frame_stacks):
            curr_state = replay_data[i][0].get_stacked_images()
            
            #modified later to keep track of next state
            next_state = np.split(curr_state, num_frame_stacks, axis=0)
            
            action = replay_data[i][1]
            if action == 119:
                action = 0
            else:
                action = 1
            reward = replay_data[i][2]
            next_frame = replay_data[i][3].gray_image\
                .reshape(1, input_num_rows, input_num_cols)
            
            #append latest state and pop the earliest
            next_state.append(next_frame)
            next_state.pop(0)
            
            is_terminal = replay_data[i][4]
            
                    
            #get the highest reward possible in the next state
            _, output = agent.predict(np.concatenate(next_state, axis=0))
            next_reward = np.max(output)
            
            #if terminal, no more rewards
            if is_terminal:
                total_reward = -5
            else:
                total_reward = reward + gamma * next_reward

            inputs.append(curr_state)
            #targets.append([0,0])
            #targets.append(output)
            targets.append(output[0])
            output[0][action] = total_reward

            agent.fit(curr_state, output[0])

        ##reset
        replay_data = deque()
#     reward = p.act(None)
#     if reward != 0 and not game_over:
#         print reward
#         game_over = True
#     if p.game_over():
#         p.reset_game()

print 'done'

0 game reset
game reset
game reset
200 game reset
game reset
game reset
400 game reset
game reset
game reset
game reset
600 game reset
game reset
game reset
800 game reset
game reset
game reset
game reset
done


In [18]:
agent._model.save('/users/momori/data/policy_agent')


In [19]:
model = load_model('/users/momori/data/policy_agent')

In [22]:
replay_data[0]

[<classes.StackedImages instance at 0x10878e518>,
 119,
 1.0,
 <classes.ImageProcessor instance at 0x1c27dafdd0>,
 False]

In [None]:
agent.predict()

In [None]:
c[1].shape
b.shape

In [None]:
def concact_ndarray_list(ndarray, *list_values):
    for i in range(len(list_values)):
        print list_values[i], '\n'
    #np.hstack([i for i in list_values])

In [None]:
concact_ndarray_list(b, c)

In [None]:
sis = []
for i in replay_data:
    sis.append(i[0])

In [None]:
sis = [i.get_stacked_images() for i in sis]

In [None]:
for image in sis:
    print 'new stack'
    for i in image:
        io.imshow(i)
        plt.show()

In [None]:
action

In [None]:
import os
##disable graphics for cloud computing
os.putenv('SDL_VIDEODRIVER', 'fbcon')
os.environ["SDL_VIDEODRIVER"] = "dummy"

#%%time
output_file = 'logs/log'
#clear log file
open(output_file, 'w').close()

#init the game flappybird
game = FlappyBird()

#set up the screen
p = PLE(game, fps=30, display_screen=False)

p.init()

agent = PolicyAgent()
observations = deque()

'''values to change in AWS env'''
max_frames = 640000 ##first frame is null
num_frame_stacks = 4
num_to_observe = 3200 #observations before we train the network


batch_count = int(num_to_observe * 0.2)  #train the model by selecting some subsample of the stored replays
                #contains lists of replay envs. each element is in the format
                #[state, action, reward, new_state, terminal(boolean)]
epsilon = 0.7 #choose best action 80% of the time
gamma = 0.8 #importance of future rewards compared to current reward
train_after_batch = True
count_game_over = 0
train_iteration_scores = []

replay_data = deque() 

game_over = False

#start time
start_time = timeit.default_timer()
for i in range(0, max_frames):
    if i % (max_frames/10) == 0:
        elapsed = timeit.default_timer() - start_time
        with open(output_file, 'a') as log_handle:
            log_handle.write('done with ' + str(i) + ' frames with ' + str(elapsed) + ' seconds\n')
        
        #reset timer
        start_time = timeit.default_timer()
    
    observation = p.getScreenRGB()
    if np.max(observation) != 0:
        observations.append(observation)
    
    
    ##after num_frame_stacks frames pass, we start predicting actions
    if len(observations) == num_frame_stacks:
        image_processors = [ImageProcessor(observations[j])
                           for j in range(num_frame_stacks)]
        stacked_images = StackedImages(image_processors, num_frame_stacks)
        if random.random() < epsilon:
            action, output = agent.predict(stacked_images.get_stacked_images())
        else:
            action = np.random.choice([119, None])

        ##update rewards    
        reward = update_reward(p.act(action))
        
                
        #get new state and create the replay data
        observation = p.getScreenRGB()
        replay_data.append([stacked_images, action, reward, 
                            ImageProcessor(observation),p.game_over()])
        
        #remove the oldest state
        observations.popleft()
        
    else: ##first few frames, do nothing
        p.act(None)
    
    #if game over, reset the game and the observations queue
    if p.game_over():
        count_game_over += 1
        p.reset_game()
        observations = deque()
        #print 'game reset'
        
    #once we have enough data, train on a subsample of it
    if len(replay_data) == num_to_observe:
        pre_training_time = timeit.default_timer()
         
        agent = train_policy(agent, replay_data, num_frame_stacks=num_frame_stacks, batch_count=batch_count)
        
        training_time = timeit.default_timer() - pre_training_time
        with open(output_file, 'a') as log_handle:  
            log_handle.write('-----------TRAINING POLICY--------------\n')
            log_handle.write('training policy done with ' + str(training_time) + '\n')
            log_handle.write('--------------------------------------\n')

        
        if train_after_batch:
            pre_testing_time = timeit.default_timer()
            test_score = test_game(agent)
            train_iteration_scores.append([count_game_over, test_score]) 
            testing_time = timeit.default_timer() - pre_testing_time
            with open(output_file, 'a') as log_handle:  
                log_handle.write('-----------TEST PLAY--------------\n')
                log_handle.write('testing done with' + str(testing_time) + '\n')
                log_handle.write('--------------------------------------\n')
            
        
        ##reset
        replay_data = deque()
with open(str(train_after_batch)+'_train_iteration_scores.pkl', 'wb') as f:
    pickle.dump(train_iteration_scores,f)
print('finished')