In [2]:
import gym
import random
import numpy as np
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.estimator import regression
from statistics import mean,median
from collections import Counter

In [3]:
#learning rate
LR = 1e-3
env = gym.make('CartPole-v0')
env.reset()
goal_steps = 500
score_requirement = 50
initial_games = 10000

[2017-05-09 20:13:04,135] Making new env: CartPole-v0


In [4]:
def random_game():
    for episode in range(5):
        env.reset()
        for t in range(goal_steps):
            env.render()
            action = env.action_space.sample()
            observation, reward , done , info = env.step(action)
            if done:
                break
    

In [5]:
#random_game()

In [6]:
def initial_population():
    training_data = []
    scores = []
    accepted_scores = []
    for _ in range(initial_games):
        score = 0
        game_memory = []
        prev_observation = []
        for _ in range(goal_steps):
            action = env.action_space.sample()
            observation, reward , done , info = env.step(action)
            if len(prev_observation)>0 : 
                game_memory.append([prev_observation,action])
            prev_observation = observation
            score += reward
            if done:
                break
        if score >= score_requirement: 
            accepted_scores.append(score)
            for data in game_memory: 
                if data[1] == 1:
                    output = [0,1]
                elif data[1] == 0:
                    output = [1,0]
                training_data.append([data[0],output])
        env.reset()
        scores.append(score)
    training_data_save = np.array(training_data)
    np.save('saved.npy',training_data_save)
    print('average accepted scores: ' , mean(accepted_scores))
    print('median accepted scores: ' , median(accepted_scores))
    print(Counter(accepted_scores))
    return training_data
initial_population()

('average accepted scores: ', 60.835)
('median accepted scores: ', 57.0)
Counter({50.0: 40, 51.0: 37, 52.0: 33, 53.0: 25, 56.0: 19, 55.0: 17, 58.0: 17, 54.0: 16, 57.0: 14, 59.0: 14, 60.0: 13, 65.0: 11, 66.0: 11, 62.0: 10, 64.0: 10, 61.0: 9, 68.0: 9, 63.0: 8, 69.0: 8, 71.0: 7, 67.0: 6, 72.0: 6, 73.0: 6, 70.0: 5, 75.0: 5, 76.0: 5, 74.0: 4, 80.0: 3, 83.0: 3, 79.0: 2, 81.0: 2, 82.0: 2, 85.0: 2, 86.0: 2, 87.0: 2, 91.0: 2, 93.0: 2, 95.0: 2, 77.0: 1, 78.0: 1, 84.0: 1, 88.0: 1, 90.0: 1, 92.0: 1, 96.0: 1, 97.0: 1, 99.0: 1, 101.0: 1, 116.0: 1})


[[array([-0.01979001, -0.22105827, -0.03601054,  0.3124498 ]), [1, 0]],
 [array([-0.02421118, -0.4156492 , -0.02976154,  0.59356215]), [0, 1]],
 [array([-0.03252416, -0.22012357, -0.0178903 ,  0.29165499]), [0, 1]],
 [array([-0.03692663, -0.02475116, -0.0120572 , -0.00661619]), [1, 0]],
 [array([-0.03742166, -0.21969814, -0.01218952,  0.28223829]), [1, 0]],
 [array([-0.04181562, -0.41464412, -0.00654475,  0.57105191]), [1, 0]],
 [array([-0.0501085 , -0.60967368,  0.00487628,  0.86166584]), [0, 1]],
 [array([-0.06230198, -0.41461847,  0.0221096 ,  0.57052012]), [0, 1]],
 [array([-0.07059434, -0.21981344,  0.03352   ,  0.2848837 ]), [1, 0]],
 [array([-0.07499061, -0.41539703,  0.03921768,  0.58794732]), [0, 1]],
 [array([-0.08329855, -0.22084561,  0.05097662,  0.30787161]), [0, 1]],
 [array([-0.08771547, -0.02648568,  0.05713406,  0.03169107]), [0, 1]],
 [array([-0.08824518,  0.16777237,  0.05776788, -0.24243228]), [0, 1]],
 [array([-0.08488973,  0.36202362,  0.05291923, -0.51634879]), [

In [7]:
def neural_network_model(input_size):
    network = input_data(shape = [None,input_size,1],name = 'input')
    
    network = fully_connected(network, 128, activation = 'relu')
    network = dropout(network,0.8)
    network = fully_connected(network, 256, activation = 'relu')
    network = dropout(network,0.8)
    network = fully_connected(network, 512, activation = 'relu')
    network = dropout(network,0.8)
    network = fully_connected(network, 128, activation = 'relu')
    network = dropout(network,0.8)
    network = fully_connected(network, 256, activation = 'relu')
    network = dropout(network,0.8)
    
    network = fully_connected(network,2,activation = 'softmax')
    network = regression(network, optimizer = 'adam', learning_rate = LR, loss = 'categorical_crossentropy', name = 'targets')
    model = tflearn.DNN(network,tensorboard_dir='log')
    #adam derived from adaptive moment estimation
    return model

In [8]:
def train_model(training_data, model=False):

    X = np.array([i[0] for i in training_data]).reshape(-1,len(training_data[0][0]),1)
    y = [i[1] for i in training_data]

    if not model:
        model = neural_network_model(input_size = len(X[0]))
    
    model.fit({'input': X}, {'targets': y}, n_epoch=5, snapshot_step=500, show_metric=True, run_id='openai_learning')
    return model
training_data = initial_population()
model = train_model(training_data)


Training Step: 1839  | total loss: [1m[32m0.65823[0m[0m | time: 4.879s
| Adam | epoch: 005 | loss: 0.65823 - acc: 0.6050 -- iter: 23488/23540
Training Step: 1840  | total loss: [1m[32m0.65427[0m[0m | time: 4.892s
| Adam | epoch: 005 | loss: 0.65427 - acc: 0.6086 -- iter: 23540/23540
--


In [9]:
scores = []
choices = []
for each_game in range(10):
    score = 0
    game_memory = []
    prev_obs = []
    env.reset()
    for _ in range(goal_steps):
        env.render()

        if len(prev_obs)==0:
            action = random.randrange(0,2)
        else:
            action = np.argmax(model.predict(prev_obs.reshape(-1,len(prev_obs),1))[0])

        choices.append(action)
                
        new_observation, reward, done, info = env.step(action)
        prev_obs = new_observation
        game_memory.append([new_observation, action])
        score+=reward
        if done: break

    scores.append(score)

print('Average Score:',sum(scores)/len(scores))
print('choice 1:{}  choice 0:{}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices)))


('Average Score:', 170.3)
choice 1:0  choice 0:0
