In [5]:
from ple.games.flappybird import FlappyBird
from ple import PLE
import random
import numpy as np
import pickle
from classes import StackedImages, ImageProcessor
from IPython.display import clear_output


from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.optimizers import SGD
from keras.models import load_model


from collections import deque

import timeit

from skimage import io
import matplotlib.pyplot as plt

# Down-sized dimensions (rows, cols) of each state frame; PolicyAgent uses them
# as the first two entries of the CNN input_shape.
input_num_rows = 96
input_num_cols = 114

# Presumably a pickle of stacked game frames (judging by the file name); it is
# not referenced by any cell visible in this notebook.
# NOTE(review): absolute, machine-specific path; note "/users" here vs "/Users"
# below, which only resolve to the same place on a case-insensitive filesystem.
data_input = '/users/momori/data/stacked_images.pkl'
# Location of the saved Keras model that is handed to load_model further down.
# NOTE(review): absolute, machine-specific path -- consider a configurable base dir.
model_input = '/Users/momori/Documents/GitHub/Data-Science/Springboard/Capstone Project 2/True_model'

In [6]:
class PolicyAgent:
    '''Convolutional network that scores the two Flappy Bird actions.

    The network emits two numbers per state, interpreted as the expected
    reward of each action:
        index 0 -> flap        (returned to the caller as action code 119)
        index 1 -> do nothing  (returned to the caller as None)
    `predict` picks the action whose score is largest, breaking ties at random.

    NOTE(review): the output layer uses ReLU, so the network cannot emit a
    negative score even though negative rewards exist -- confirm this is intended.
    '''

    def __init__(self):
        # Layers are listed in the order they are stacked: one 8x8 convolution
        # with 32 filters, a 10x10 max-pool, then two dense layers (10 -> 2).
        layer_stack = (
            Conv2D(32, (8, 8), activation='relu',
                   input_shape=(input_num_rows, input_num_cols, 4)),
            MaxPooling2D(pool_size=(10, 10)),
            Flatten(),
            Dense(10, activation='relu'),
            Dense(2, activation='relu'),
        )
        network = Sequential()
        for layer in layer_stack:
            network.add(layer)
        # Regression on the per-action scores: mean squared error with plain SGD.
        network.compile(loss='mse', optimizer='sgd')

        self._model = network

    def _to_batch(self, frames):
        '''Reshape one state into a batch of one for the network.

        The result has shape (1, frames.shape[1], frames.shape[2], frames.shape[0]).
        NOTE(review): this is a reshape, not a transpose -- if `frames` arrives
        as (channels, rows, cols) the pixel data is re-read in memory order
        rather than moved channel-last. Any saved model was presumably trained
        with the same call, so confirm before changing it.
        '''
        dims = frames.shape
        return frames.reshape(1, dims[1], dims[2], dims[0])

    def fit(self, x_data, y_data, batch_size=32, epochs=1):
        '''Fit the network on a single state and its two target scores.

        x_data: array for one state (reshaped by `_to_batch`).
        y_data: two target values, reshaped to (1, 2).
        batch_size, epochs: forwarded positionally to the model's `fit`.
        '''
        inputs = self._to_batch(x_data)
        targets = np.array(y_data).reshape(1, 2)
        self._model.fit(inputs, targets, batch_size, epochs, verbose=0)

    def predict(self, x_data):
        '''Score one state and choose an action.

        Returns a tuple (action, output): `action` is 119 when index 0 has the
        top score and None otherwise; `output` is the raw model prediction.
        '''
        scores = self._model.predict(self._to_batch(x_data))
        # Every position that ties for the maximum is a candidate; pick one at random.
        candidates = np.flatnonzero(scores == np.max(scores))
        chosen = np.random.choice(candidates)
        action = 119 if chosen == 0 else None
        return action, scores

    def train_on_batch(self, x_data, y_data):
        '''Forward an already batch-shaped (x, y) pair to the model unchanged.'''
        self._model.train_on_batch(x_data, y_data)

    def summary(self):
        '''Return whatever the underlying model's `summary()` returns.'''
        return self._model.summary()

    def load(self, model):
        '''Replace the wrapped network with `model` (e.g. one loaded from disk).'''
        self._model = model

In [7]:
def update_reward(value):
    '''Map a raw game reward onto the shaped reward used for evaluation.

    Mapping:
         0.0 -> 0.001
         5.0 -> 20.0
        -5.0 -> -5.0

    Any other value is printed and None is returned, so callers must be
    prepared for a None result.
    '''
    if value == 0.0:
        return 0.001
    if value == 5.0:
        return 20.0
    if value == -5.0:
        return -5.0
    # Unrecognised reward: surface it for inspection. The single-argument call
    # form prints the same text under Python 2 and is valid syntax on Python 3.
    print(value)
    return None
def test_game(agent, max_frames=1000):
    '''Let `agent` play Flappy Bird and return the total shaped reward.

    agent: object with a `predict(state)` method returning `(action, output)`.
    max_frames: number of game steps to run; every loop iteration performs
        exactly one `p.act(...)` call. Defaults to 1000.

    The game is reset (and the frame buffer emptied) whenever it ends, so a
    single call may span several episodes.

    Returns the sum of `update_reward(...)` over all steps on which the agent
    acted; steps whose reward came back as None are not counted.
    '''
    # Frames stacked into one state. PolicyAgent's input_shape uses 4 as its
    # last dimension, so this presumably has to stay in sync with the model.
    num_frame_stacks = 4

    # Set up the game and its display.
    game = FlappyBird()
    p = PLE(game, fps=30, display_screen=True)
    p.init()

    observations = deque()
    total_reward = 0

    for _ in range(max_frames):
        observation = p.getScreenRGB()
        # All-zero screens are not buffered (the first frame is null).
        if np.max(observation) != 0:
            observations.append(observation)

        if len(observations) == num_frame_stacks:
            # Enough frames buffered: build the stacked state and act on it.
            image_processors = [ImageProcessor(frame) for frame in observations]
            stacked_images = StackedImages(image_processors, num_frame_stacks)
            action, output = agent.predict(stacked_images.get_stacked_images())
            reward = update_reward(p.act(action))
            # One pre-formatted string prints identically on Python 2 and 3.
            print('{} {} {}'.format(action, output, reward))
            # update_reward returns None for raw rewards it does not recognise;
            # adding None to the running total would raise a TypeError.
            # NOTE(review): PLE's default reward for passing a pipe is believed
            # to be 1.0, which update_reward does not map -- confirm.
            if reward is not None:
                total_reward += reward

            # Drop the oldest frame so the next one can be appended.
            observations.popleft()
        else:
            # Still filling the buffer: take no action this step.
            p.act(None)

        # On game over, restart the game and start buffering from scratch.
        if p.game_over():
            p.reset_game()
            observations.clear()
    return total_reward

In [4]:
# Load the previously saved Keras model from the path in model_input.
# NOTE(review): this cell's execution count (4) is lower than the import cell's
# (5), i.e. it was last run out of order; on a fresh kernel the import/config
# cell must run first.
model = load_model(model_input)

In [None]:
# Build an agent, then replace its freshly initialised network with the loaded model.
agent = PolicyAgent()
agent.load(model)

In [None]:
# Play the game with the loaded model; the cell output is the returned total reward.
test_game(agent)

In [None]:
# Display every weight array of the loaded model.
# NOTE(review): this dumps the full arrays; the following cells look at the
# same list more selectively, so this cell may be redundant.
model.get_weights()

In [None]:
# Keep the loaded model's weight list for the inspection cells that follow.
weights = model.get_weights()

In [None]:
# Container type of the weight list and how many arrays it holds.
type(weights), len(weights)

In [None]:
# Sixth array in the loaded model's weight list.
# NOTE(review): if the saved model has the same layers as PolicyAgent, this
# would be the bias of the final Dense(2) layer -- confirm with model.summary().
weights[5]

In [None]:
# Print the layer-by-layer summary of the loaded model.
model.summary()

In [None]:
# Build an untrained agent and grab its initial weights, for comparison with
# the loaded model's weights inspected above.
a = PolicyAgent()
b = a._model.get_weights()

In [None]:
# Fifth array of the untrained agent's weights.
# NOTE(review): given PolicyAgent's layer order this should be the kernel of
# the final Dense(2) layer -- confirm against a._model.summary().
b[4]