In [1]:
import gym
import random
import numpy as np
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.estimator import regression
from statistics import mean, median
from collections import Counter

curses is not supported on this machine (please install/reinstall curses for an optimal experience)


In [2]:
lr = 1e-3
env = gym.make('CartPole-v0')
env.reset()

[2018-01-22 12:37:36,128] Making new env: CartPole-v0


array([ 0.03772903,  0.00419176, -0.04487232,  0.04364575])

In [3]:
goal_steps = 500
score_requirement = 50
initial_games = 10000

In [None]:
def some_random_games_first():
    for episode in range(5):
        env.reset()
        for t in range(goal_steps):
            env.render()
            # takes a random action from the action space
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                break

In [None]:
some_random_games_first()

In [10]:
# generate training samples
def initial_population():
    # stores the observations that a random move made
    training_data = []
    scores = []
    # appends the data only if the corresponding score happens to be above 50
    accepted_scores = []
    for i in range(initial_games):
        score = 0
        # store game memory as we will not know until the end of the game whether or not we beat the required score
        game_memory = []
        prev_observation = []
        # iterate through the plausible steps
        for j in range(goal_steps):
            action = random.randrange(0, 2) # generates a 0 or a 1
            observation, reward, done, info = env.step(action)
            
            if len(prev_observation) > 0:
                # basically looks at the previous frame and stores the current action and if the score is high enough,
                # it will be appended to the training data as we looked at the previous frame and did something on
                # this frame that increased our score, and we would like our neural network to learn this mapping
                game_memory.append([prev_observation, action])
                
            prev_observation = observation
            score += reward
            if done:
                break
        
        if score >= score_requirement:
            accepted_scores.append(score)
            # converting output into a one-hot vector for training
            for data in game_memory:
                if data[1] == 1:
                    output = [0, 1]
                elif data[1] == 0:
                    output = [1, 0]
                    
                # appends the observation and the given output(in one-hot vector format)
                training_data.append([data[0], output])
                
        env.reset()
        scores.append(score)
    
    training_data_save = np.array(training_data)
    np.save('cartpole-v0_training_data.npy', training_data_save)
    
    print('Average accepted score: ', mean(accepted_scores))
    print('Median accepted score: ', median(accepted_scores))
    print(Counter(accepted_scores))
    
    return training_data

In [11]:
initial_population()

[2018-01-22 12:41:14,917] You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.


Average accepted score:  60.79389312977099
Median accepted score:  57.0
Counter({51.0: 37, 54.0: 33, 50.0: 33, 52.0: 25, 55.0: 21, 56.0: 19, 60.0: 18, 57.0: 17, 53.0: 17, 62.0: 17, 59.0: 12, 67.0: 11, 58.0: 11, 65.0: 11, 61.0: 11, 70.0: 9, 63.0: 9, 71.0: 8, 64.0: 7, 69.0: 6, 77.0: 6, 78.0: 6, 74.0: 5, 68.0: 5, 76.0: 4, 66.0: 3, 93.0: 3, 75.0: 3, 90.0: 3, 73.0: 2, 89.0: 2, 83.0: 2, 85.0: 2, 81.0: 2, 84.0: 1, 118.0: 1, 109.0: 1, 80.0: 1, 79.0: 1, 91.0: 1, 94.0: 1, 110.0: 1, 92.0: 1, 88.0: 1, 87.0: 1, 86.0: 1, 98.0: 1})


[[array([-0.00597893, -0.20881766, -0.03947159,  0.2866333 ]), [0, 1]],
 [array([-0.01015528, -0.01315568, -0.03373893, -0.01823271]), [0, 1]],
 [array([-0.0104184 ,  0.18243348, -0.03410358, -0.32136684]), [1, 0]],
 [array([-0.00676973, -0.01218664, -0.04053092, -0.03963092]), [0, 1]],
 [array([-0.00701346,  0.18349238, -0.04132354, -0.34482122]), [0, 1]],
 [array([-0.00334361,  0.37917707, -0.04821996, -0.65024333]), [0, 1]],
 [array([ 0.00423993,  0.57493633, -0.06122483, -0.95771209]), [1, 0]],
 [array([ 0.01573865,  0.3806887 , -0.08037907, -0.68487543]), [1, 0]],
 [array([ 0.02335243,  0.18676928, -0.09407658, -0.41854145]), [0, 1]],
 [array([ 0.02708781,  0.38308966, -0.10244741, -0.73933759]), [1, 0]],
 [array([ 0.03474961,  0.18952029, -0.11723416, -0.48057278]), [1, 0]],
 [array([ 0.03854001, -0.00376857, -0.12684561, -0.22701755]), [1, 0]],
 [array([ 0.03846464, -0.19687106, -0.13138596,  0.02311639]), [1, 0]],
 [array([ 0.03452722, -0.38988803, -0.13092364,  0.27163005]), [

In [12]:
def neural_network_model(input_size):
    network = input_data(shape=[None, input_size, 1], name='input')
    network = fully_connected(network, 128, activation='relu')
    network = dropout(network, 0.8) # keep_rate=0.8
    network = fully_connected(network, 256, activation='relu')
    network = dropout(network, 0.8)
    network = fully_connected(network, 512, activation='relu')
    network = dropout(network, 0.8)
    network = fully_connected(network, 256, activation='relu')
    network = dropout(network, 0.8)
    network = fully_connected(network, 128, activation='relu')
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=lr, loss='categorical_crossentropy', name='targets')
    model = tflearn.DNN(network, tensorboard_dir='log')
    return model

In [15]:
def train_model(training_data, model=False):
    X = np.array([i[0] for i in training_data]).reshape(-1, len(training_data[0][0]), 1) # grabs the observation column of the training data
    y = [i[1] for i in training_data]
    
    if not model:
        model = neural_network_model(input_size=len(X[0]))
    
    model.fit({'input':X}, {'targets':y}, n_epoch=5, snapshot_step=500, show_metric=True, run_id='cartpole-v0-v0.00')
    return model

In [16]:
training_data = initial_population()
model = train_model(training_data)

Training Step: 1754  | total loss: [1m[32m0.66691[0m[0m | time: 8.414s
| Adam | epoch: 005 | loss: 0.66691 - acc: 0.5935 -- iter: 22400/22434
Training Step: 1755  | total loss: [1m[32m0.66660[0m[0m | time: 8.446s
| Adam | epoch: 005 | loss: 0.66660 - acc: 0.5966 -- iter: 22434/22434
--


In [17]:
model.save('cartpole-v0-v0.00')

INFO:tensorflow:C:\Users\Aman Deep Singh\Documents\Python\Data Science\Machine Learning\Reinforcement Learning\cartpole-v0-v0.00 is not in all_model_checkpoint_paths. Manually adding it.


[2018-01-22 13:14:20,888] C:\Users\Aman Deep Singh\Documents\Python\Data Science\Machine Learning\Reinforcement Learning\cartpole-v0-v0.00 is not in all_model_checkpoint_paths. Manually adding it.


In [None]:
scores = []
choices = []
for each_game in range(10):
    score = 0
    game_memory = []
    prev_observation = []
    env.reset()
    for _ in range(goal_steps):
        env.render()
        if len(prev_observation) == 0:
            action = random.randrange(0, 2)
        else:
            print(model.predict(prev_observation.reshape(-1, len(prev_observation), 1)))
            action = np.argmax(model.predict(prev_observation.reshape(-1, len(prev_observation), 1))[0])
            
        choices.append(action)
        new_observation, reward, done, info = env.step(action)
        prev_observation = new_observation
        game_memory.append([new_observation, action])
        score += reward
        if done:
            break
    scores.append(score)
    
print('Average Score ', sum(scores)/len(scores))
print(f'Choice 1: {choices.count(1)/len(choices)}, Choice 0: {choices.count(0)/len(choices)}')