In [None]:
import gym
import random
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected
from statistics import median, mean
from collections import Counter
from time import sleep

In [None]:
#
# Demo of the basic OpenAI Gym loop: play 10 episodes of CartPole with a purely
# random policy, drawing every frame and reporting how long each episode lasted.
# Each episode is capped at 100 steps; nothing is printed if the cap is reached
# before the environment signals `done`.
#
env = gym.make('CartPole-v0')
env.reset()
for _ in range(10):
    observation = env.reset()
    sleep(0.5)
    for t in range(100):
        # Grab the current frame as an RGB array and draw it with matplotlib.
        frame = env.render(mode='rgb_array')
        plt.imshow(frame)
        # No strategy here: sample any valid action from the action space.
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        if not done:
            continue
        # t is zero-based, so the number of executed steps is t + 1.
        print("Episode finished after {} timesteps".format(t+1))
        break

In [None]:
#
# Hand-written policy: look at the observation and react to it.
# When observation[2] is positive the action is 1, otherwise it is 0
# (i.e. push the cart towards the side the pole is on).
# Plays 10 episodes and tracks the best total reward in `highscore`.
#
env = gym.make('CartPole-v0')
env.reset()
highscore = 0
for _ in range(10):
    observation = env.reset()
    sleep(0.5)
    points = 0
    done = False
    while not done:
        plt.imshow(env.render(mode='rgb_array'))
        # Decide from the observation taken *before* stepping.
        action = int(observation[2] > 0)
        observation, reward, done, info = env.step(action)
        points += reward
    # Episode over: remember the score only if it strictly beats the best so far.
    if highscore < points:
        highscore = points

In [None]:
#
# Play `initial_games` games with random actions and keep only the games whose
# total reward reaches `score_requirement`.
# Every (observation, action) pair of an accepted game becomes one training
# sample: the observation goes into `train_data`, the one-hot encoded action
# into `labels`. This data is used to train the network in the next cell.
#
goal_steps = 200
score_requirement = 100
initial_games = 50000

# Build the environment here so this cell does not depend on whatever state an
# earlier cell left `env` in (for example an episode that already ended).
env = gym.make('CartPole-v0')

train_data = []
labels = []
scores = []
accepted_scores = []
for _ in range(initial_games):
    score = 0
    game_memory = []
    # Start every game from a fresh episode and keep the initial observation,
    # so the very first action is paired with the state it was taken in.
    prev_observation = env.reset()
    for _ in range(goal_steps):
        action = random.randrange(0, 2)
        observation, reward, done, info = env.step(action)
        # Store the state the action was chosen in, not the state it led to.
        game_memory.append([prev_observation, action])
        prev_observation = observation
        score += reward
        if done:
            break

    if score >= score_requirement:
        accepted_scores.append(score)
        for saved_observation, saved_action in game_memory:
            # One-hot label: index 0 -> action 0, index 1 -> action 1.
            output = [0, 1] if saved_action == 1 else [1, 0]
            train_data.append(saved_observation)
            labels.append(output)

    scores.append(score)


# mean()/median() raise StatisticsError on an empty list, which happens when no
# random game reached the threshold; report that case explicitly instead.
if accepted_scores:
    print('Average accepted score:', mean(accepted_scores))
    print('Median score for accepted scores:', median(accepted_scores))
    print(Counter(accepted_scores))
else:
    print('No game reached score_requirement = {}; '
          'lower score_requirement or raise initial_games.'.format(score_requirement))

In [None]:
#
# Build a 3-hidden-layer fully connected network (with dropout) that maps an
# observation to a probability for each of the two actions, train it on the
# collected (observation, one-hot action) pairs and save it to ./final.ckpt.
# The whole data set is fed as a single batch on every epoch.
#
print('Starting training the Neural Network...')
print('Train data size: ', len(train_data))

# Fail with a clear message instead of an IndexError on train_data[0].
if len(train_data) == 0:
    raise ValueError('train_data is empty: no game passed the score requirement, '
                     'so there is nothing to train on.')

train_data = np.array(train_data).reshape(-1, len(train_data[0]))
labels = np.array(labels)

# Timestamp without '/', ' ' or ':' so the run gets ONE directory whose name is
# valid on every platform (the previous "%d/%m at ..." created nested folders).
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_log = "tf_logs"
logdir = "{}/run-{}/".format(root_log, now)

n_inputs = 4
n_hidden = 144
n_outputs = 2
dropout_keep_prob = 0.6

learning_rate = 0.1

n_epochs = 200

# This cell is meant to be re-run after more data has been collected; start
# from an empty graph so ops and variables are not duplicated on each run.
tf.reset_default_graph()

X = tf.placeholder(tf.float32, shape=(None, n_inputs))
# Labels are one-hot rows. Note that `shape=(None)` is just `None` (no shape at
# all), so the shape is spelled out explicitly here.
y = tf.placeholder(tf.int32, shape=(None, n_outputs))
# Dropout keep probability as a graph input: it defaults to 1.0 (dropout off)
# and is only lowered by feeding `dropout_keep_prob` during training steps.
# A plain Python constant would stay baked into the graph at inference time.
keep_prob = tf.placeholder_with_default(1.0, shape=(), name="keep_prob")

with tf.name_scope("dnn"):
    hidden_1 = fully_connected(X, n_hidden)
    dropout_1 = tf.nn.dropout(hidden_1, keep_prob=keep_prob)

    hidden_2 = fully_connected(dropout_1, n_hidden)
    dropout_2 = tf.nn.dropout(hidden_2, keep_prob=keep_prob)

    hidden_3 = fully_connected(dropout_2, n_hidden)
    dropout_3 = tf.nn.dropout(hidden_3, keep_prob=keep_prob)

    # Despite the name, softmax is already applied: these are probabilities.
    logits = fully_connected(dropout_3, n_outputs, activation_fn=tf.nn.softmax)

with tf.name_scope("loss"):
    xentropy = tf.keras.backend.categorical_crossentropy(target=tf.cast(y, tf.float32), output=logits)
    loss = tf.reduce_mean(xentropy)
    loss_summary = tf.summary.scalar('Loss', loss)

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training = optimizer.minimize(loss)

with tf.name_scope("eval"):
    # Top-1 accuracy: the predicted class must equal the labelled class.
    # (Top-k with k == 2 over 2 classes is trivially always 100%.)
    correct = tf.equal(tf.argmax(y, axis=1), tf.argmax(logits, axis=1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
saver = tf.train.Saver()

try:
    with tf.Session() as sess:
        init.run()

        for epoch in range(n_epochs + 1):
            X_batch = train_data
            y_batch = labels
            # Training step: dropout enabled.
            sess.run(training, feed_dict={X: X_batch, y: y_batch, keep_prob: dropout_keep_prob})
            # Metrics: one forward pass, dropout disabled (keep_prob default),
            # so accuracy, loss and the logged summary describe the same pass.
            acc_train, acc_loss, summary = sess.run(
                [accuracy, loss, loss_summary],
                feed_dict={X: X_batch, y: y_batch})
            file_writer.add_summary(summary, epoch)
            print('Epoch: {} of {}, Accuracy: {}, Loss: {} \r'.format(epoch, n_epochs, acc_train, acc_loss), end="")

        save_path = saver.save(sess, './final.ckpt')
finally:
    # Flush pending events and release the log file even if training fails.
    file_writer.close()

In [None]:
#
# Load the trained network and let it play 5 games. Every game that reaches
# `score_requirement` (200, the full episode length) contributes all of its
# (observation, action) pairs to `train_data` / `labels`.
# After this cell, re-run the training cell to refine the network on the
# enlarged data set.
#
score_requirement = 200
accepted_scores = []

# NOTE: assigning this Python variable does not modify ops that already exist
# in the graph; it only matters to code that builds dropout ops from it later.
dropout_keep_prob = 1.0

env = gym.make('CartPole-v0')

# Samples gathered from accepted games; merged into the arrays once at the end
# instead of re-allocating the whole array for every single sample.
new_observations = []
new_labels = []

with tf.Session() as sess:
    saver.restore(sess, "./final.ckpt")

    for each_game in range(5):
        score = 0
        game_memory = []
        # Use the observation of THIS episode's initial state. Discarding the
        # reset() result would make the first prediction run on a stale
        # observation left over from a previous game or cell.
        observation = env.reset()
        sleep(0.5)
        for _ in range(goal_steps):
            plt.imshow(env.render(mode='rgb_array'))

            # Ask the network for action probabilities and act greedily.
            X_batch = observation.reshape(-1, len(observation))
            z = logits.eval(feed_dict={X: X_batch})
            action = int(np.argmax(z, axis=1)[0])

            # Record the state together with the action chosen in that state,
            # before the environment moves on to the next state.
            game_memory.append([observation, action])

            observation, reward, done, info = env.step(action)
            score += reward
            if done:
                break

        if score >= score_requirement:
            accepted_scores.append(score)
            for saved_observation, saved_action in game_memory:
                # One-hot label: index 0 -> action 0, index 1 -> action 1.
                output = [0, 1] if saved_action == 1 else [1, 0]
                new_observations.append(saved_observation)
                new_labels.append(output)

    if new_observations:
        train_data = np.concatenate([train_data, np.asarray(new_observations)], axis=0)
        labels = np.concatenate([labels, np.asarray(new_labels)], axis=0)

    print(Counter(accepted_scores))

# Release the rendering resources held by the environment.
env.close()