In [118]:
import gym
import random
import numpy as np
from statistics import median, mean
from collections import Counter
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

In [119]:
# --- Tunable constants -------------------------------------------------------
# Learning-rate constant (not referenced by the other cells visible here).
LR = 0.001
# Upper bound on the number of steps played in a single game.
goal_steps = 500
# Minimum score a random game must reach for its moves to become training data.
score_requirement = 50
# Number of random games played while collecting training data.
initial_games = 100_000

# --- Environment -------------------------------------------------------------
env = gym.make("CartPole-v1")
# Put the environment into a valid starting state before any step() call.
env.reset()

CartPoleEnv - Version 0.2.0, Noise case: 1


In [120]:
def initial_population(environment=None, games=None, max_steps=None,
                       min_score=None, save_path='saved.npy'):
    """Play random games and keep the moves of the ones that scored well.

    Every game is played with uniformly random actions. For each step the
    observation the agent saw *before* acting is paired with the action it
    took. Games whose total reward reaches ``min_score`` contribute all of
    their (observation, one-hot action) pairs to the returned data set.

    Args:
        environment: Object with ``reset()`` returning an observation and
            ``step(action)`` returning ``(observation, reward, done, info)``.
            Defaults to the notebook-level ``env``.
        games: Number of random games to play. Defaults to ``initial_games``.
        max_steps: Step cap per game. Defaults to ``goal_steps``.
        min_score: Score a game needs to be accepted. Defaults to
            ``score_requirement``.
        save_path: Where to ``np.save`` the collected data; ``None`` skips
            saving. The file holds an object array, so it must be read back
            with ``np.load(path, allow_pickle=True)``.

    Returns:
        list: ``[observation, one_hot_action]`` pairs, where the one-hot
        action is ``[1, 0]`` for action 0 and ``[0, 1]`` for action 1.
    """
    # Fall back to the notebook globals only for arguments that were omitted.
    environment = env if environment is None else environment
    games = initial_games if games is None else games
    max_steps = goal_steps if max_steps is None else max_steps
    min_score = score_requirement if min_score is None else min_score

    # [OBS, MOVES]
    training_data = []
    # just the scores that met our threshold:
    accepted_scores = []

    # Keep the observation reset() hands back: it is the state the first
    # action of the game responds to, so dropping it would lose one sample
    # per game.
    prev_observation = environment.reset()

    for _ in range(games):
        score = 0
        # moves specifically from this game:
        game_memory = []
        for _step in range(max_steps):
            # choose random action (0 or 1)
            action = random.randrange(0, 2)
            observation, reward, done, info = environment.step(action)

            # The observation is returned FROM the action, so the training
            # pair is (what we saw before acting, what we did).
            game_memory.append([prev_observation, action])
            prev_observation = observation
            score += reward
            if done:
                break

        # Only the final score decides whether a game is kept; nothing here
        # judges HOW that score was reached.
        if score >= min_score:
            accepted_scores.append(score)
            for seen, move in game_memory:
                # one-hot encode the action (actions are only ever 0 or 1)
                output = [0, 1] if move == 1 else [1, 0]
                training_data.append([seen, output])

        # reset env to play again, remembering the fresh starting observation
        prev_observation = environment.reset()

    if save_path is not None:
        # Each row mixes an observation array with a 2-element list, which is
        # ragged; dtype=object keeps NumPy from rejecting it.
        np.save(save_path, np.array(training_data, dtype=object))

    # some stats here; guarded because mean()/median() raise on empty input
    if accepted_scores:
        print('Average accepted score:', mean(accepted_scores))
        print('Median score for accepted scores:', median(accepted_scores))
        print(Counter(accepted_scores))
    else:
        print('No games reached the score requirement of', min_score)

    return training_data

In [None]:
# Collect the random-play training set (also written to 'saved.npy').
x = initial_population()

In [None]:
def neural_network_model(input_size):
    """Build and compile the fully connected action classifier.

    The network is a 50-unit ReLU input layer, followed by five hidden
    ReLU layers (128, 256, 512, 256, 128 units) each followed by 30 %
    dropout, and a single sigmoid output unit. It is compiled with binary
    cross-entropy loss, the 'adam' optimizer and an accuracy metric.

    Args:
        input_size: Number of features in one observation.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    hidden_widths = (128, 256, 512, 256, 128)

    net = Sequential()
    net.add(Dense(50, input_dim=input_size, activation='relu'))

    # Every hidden block is the same Dense + Dropout pair; only the width varies.
    for width in hidden_widths:
        net.add(Dense(width, kernel_initializer='normal', activation='relu'))
        net.add(Dropout(0.3))

    net.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net


def train_model(training_data, model=False):
    """Fit the action classifier on (observation, one-hot action) pairs.

    Args:
        training_data: Sequence of ``[observation, one_hot_action]`` pairs.
        model: An already-built model to continue training. When left at
            ``False`` (or ``None``) a new one is created with
            ``neural_network_model``.

    Returns:
        The fitted model.

    Raises:
        ValueError: If ``training_data`` is empty.
    """
    if len(training_data) == 0:
        raise ValueError('training_data is empty; no accepted games to learn from')

    X = np.array([sample[0] for sample in training_data])
    print(X.shape)
    # Collapse the one-hot action to its index (0 or 1). The target is handed
    # to fit() as an (N, 1) float array so it lines up with the single
    # sigmoid output; a plain Python list can be misread by Keras as a list
    # of separate outputs.
    y = np.array(
        [int(np.argmax(sample[1])) for sample in training_data],
        dtype=np.float32,
    ).reshape(-1, 1)

    # Explicit sentinel check: do not depend on the truthiness of a model object.
    if model is None or model is False:
        model = neural_network_model(input_size=len(X[0]))

    model.fit(X, y, batch_size=128, epochs=30)
    return model



In [None]:
# Train a fresh network on the collected random-play data.
model = train_model(training_data=x)

In [None]:
# Let the trained model play a few games and report how it did.
scores = []
choices = []
for each_game in range(5):
    score = 0
    game_memory = []
    # Start from the observation reset() returns so that even the first move
    # of the game is chosen by the model rather than at random.
    prev_obs = env.reset()
    # Render only after the reset, once the environment has a fresh state.
    env.render()
    for _ in range(goal_steps):
        prediction = np.asarray(
            model.predict(np.asarray(prev_obs).reshape(1, -1), verbose=0)
        ).ravel()
        if prediction.size == 1:
            # Single sigmoid output = probability of action 1. argmax over a
            # single value is always 0, so threshold the probability instead.
            action = int(prediction[0] > 0.5)
        else:
            # One score per action (e.g. a softmax head): take the best one.
            action = int(np.argmax(prediction))

        choices.append(action)

        new_observation, reward, done, info = env.step(action)
        prev_obs = new_observation
        game_memory.append([new_observation, action])
        score += reward
        if done:
            break

    scores.append(score)

print('Average Score:',sum(scores)/len(scores))
print('choice 1:{}  choice 0:{}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices)))
print(score_requirement)