In [1]:
# import libraries
import random
import numpy as np
from statistics import median, mean
from collections import Counter
from IPython.display import clear_output # only for jupyter notebook

In [2]:
# Import the game environment class from the `puzzle` module.
from puzzle import GameGrid
# Single shared game instance: every cell below operates on this `env`.
env = GameGrid()
# Reset before first use (presumably sets up a fresh starting board -- confirm in puzzle).
env.reset()

In [3]:
# Display the current board; the 4x4 grid shown below is the state right after the reset above.
env.display_state()


[2, 0, 0, 0]
[0, 0, 0, 0]
[2, 0, 0, 0]
[0, 0, 0, 0]


# Generate Random Data
Play a few random games to check that the environment is working properly.

In [4]:
def some_random_games_first(episodes=5, max_steps=200):
    """Play a few games with random moves as a visual smoke test of ``env``.

    Operates on the notebook-level ``env``. Before every move the cell output
    is cleared and the current board is redrawn, which makes this slow; it is
    only meant for eyeballing that the environment behaves sensibly.

    Args:
        episodes: Number of independent games to play (default 5).
        max_steps: Upper bound on moves per game (default 200). A game ends
            earlier as soon as ``env.step`` reports ``done``.
    """
    # Each episode is its own game.
    for _ in range(episodes):
        env.reset()
        for _ in range(max_steps):
            # Redraw the board. wait=True defers the clear until the next
            # output arrives, so the board is replaced in place instead of
            # flickering between an empty cell and the new state.
            clear_output(wait=True)
            env.display_state()

            # Ask the environment for a random move. Judging by the one-hot
            # encoding used in the training-data cell of this notebook, this
            # is one of the four direction strings (up/down/right/left).
            action = env.action_space()

            # step() applies the move and returns
            # (observation, reward, done, info); only `done` matters here.
            _observation, _reward, done, _info = env.step(action)
            if done:
                break

In [5]:
# Run the random-play smoke test; the board shown below is the last state that was drawn.
some_random_games_first()


[2, 2, 8, 4]
[16, 32, 2, 32]
[4, 2, 16, 4]
[2, 32, 4, 8]


# Generate and Save Training Data

In [7]:
# Play many random games and keep the (state, action) pairs of every game
# whose total reward reaches `score_requirement`. The pairs are stored in
# `training_data` and saved to 'data/saved.npy' (the 'data' directory must
# already exist).
score_requirement = 16 # keep a game's (state, action) pairs only if its total reward is >= this
initial_games = 10000 # number of games played
goal_steps = 10000 # maximum number of steps in each game

# One-hot encoding of the action strings produced by env.action_space().
# Vector order is [UP, DOWN, RIGHT, LEFT]. Note that the keys contain the
# inner quote characters, e.g. the four-character-free string "'w'".
ACTION_TO_ONE_HOT = {
    "'w'": [1, 0, 0, 0],
    "'s'": [0, 1, 0, 0],
    "'d'": [0, 0, 1, 0],
    "'a'": [0, 0, 0, 1],
}

# [OBS, MOVES]
training_data = []
# total reward of every game played:
rewards = []
# total reward of just the games that met our threshold:
accepted_rewards = []
# env.highest_score() of each accepted game
accepted_scores = []

# Start the first game from a fresh board: an earlier cell may have left
# `env` in the middle of (or at the end of) a game.
env.reset()

# iterate through however many games we want:
for _game in range(initial_games):
    score = 0
    # [previous observation, action] pairs from this game only:
    game_memory = []
    # observation returned by the previous step (empty before the first step)
    prev_observation = []
    # play at most goal_steps moves
    for _step in range(goal_steps):
        # choose a random action
        action = env.action_space()
        # do it!
        observation, reward, done, info = env.step(action)

        # The observation is returned FROM the action, so the action is paired
        # with the observation seen BEFORE it was taken. The first move of a
        # game is not recorded because no earlier observation is available.
        if len(prev_observation) > 0:
            game_memory.append([prev_observation, action])
        prev_observation = observation
        score += reward
        if done:
            break

    # If the game's total reward meets the threshold, save every recorded
    # move. Only the final score is reinforced; nothing here influences HOW
    # that score was reached.
    if score >= score_requirement:
        accepted_rewards.append(score)
        accepted_scores.append(env.highest_score())
        for prev_state, move in game_memory:
            # Fail loudly on an unexpected action instead of silently reusing
            # the previous sample's label.
            if move not in ACTION_TO_ONE_HOT:
                raise ValueError("Unknown action {!r}; expected one of {}".format(
                    move, sorted(ACTION_TO_ONE_HOT)))
            # list(...) copies the label so samples never share one list object
            one_hot = list(ACTION_TO_ONE_HOT[move])
            # saving our training data: [flattened board, one-hot action]
            training_data.append([np.array(prev_state).flatten().tolist(), one_hot])

    # reset env to play again
    env.reset()
    # save overall scores
    rewards.append(score)

# Just in case you wanted to reference later. Each row pairs a flattened
# board with a 4-element label, i.e. the rows are ragged, so an explicit
# object dtype is required (recent NumPy raises ValueError otherwise).
# Load with np.load('data/saved.npy', allow_pickle=True).
training_data_save = np.array(training_data, dtype=object)
np.save('data/saved.npy', training_data_save)

# some stats here, to further illustrate the neural network magic!
print("Training Points", len(training_data))
if accepted_rewards:
    print('Average accepted score:', mean(accepted_rewards))
    print('Median score for accepted rewards:', median(accepted_rewards))
else:
    # mean()/median() raise StatisticsError on empty data
    print('No game reached score_requirement =', score_requirement)
print(Counter(accepted_scores))

Average accepted score: 11241.985
Median score for accepted rewards: 9706.0
Counter({128: 5026, 64: 3477, 256: 954, 32: 524, 16: 18, 512: 1})
