In [None]:
import gym
import tensorflow as tf
import numpy as np
import random
import os.path

# Environment and checkpoint location shared by the cells below.
env = gym.make('CartPole-v1')
weight_path = 'cartpole_mcq.h5'

# Q-network: takes a 4-feature observation and emits one linear Q-value
# per action (2 outputs), with two 32-unit ReLU hidden layers in between.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, input_shape=(4,), activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(2, activation='linear'),
])

# Resume from a previously saved checkpoint when one exists on disk.
if os.path.isfile(weight_path):
    print('loading weight')
    model.load_weights(weight_path)

# Regression setup: the network is fitted to Q-value targets with MSE.
model.compile(optimizer='adam', loss='mse')
model.summary()

In [None]:
def get_action(q, e=0.05):
    """Choose an action epsilon-greedily from a set of Q-values.

    Parameters
    ----------
    q : array-like
        Q-value estimates for a single state. Any shape is accepted; the
        array is treated as flat, so a ``(1, n_actions)`` prediction works.
    e : float, optional
        Probability of ignoring ``q`` and picking a uniformly random action.
        Defaults to 0.05. Pass ``0.0`` for purely greedy behaviour.

    Returns
    -------
    int
        Index of the chosen action, in ``range(np.size(q))``.
    """
    # One uniform draw decides between exploring and exploiting.
    if random.random() < e:
        # Explore: sample from the same index space the greedy branch uses.
        return random.randrange(0, np.size(q))
    # Exploit: argmax over the flattened Q-values, as a plain Python int so
    # both branches return the same type.
    return int(np.argmax(q))

# Monte Carlo Q-value training loop.
#
# Each iteration plays one full episode with the current network, then moves
# the Q-value of every action taken towards the return observed from that
# step to the end of the episode, and fits the network on those targets.
# Uses the classic Gym API: reset() returns the observation and step()
# returns a 4-tuple (observation, reward, done, info).

REWARD_SCALE = 500     # per-step reward divisor; presumably the CartPole-v1
                       # step cap, so a full episode sums to ~1.0 (confirm)
Q_STEP_SIZE = 0.05     # fraction of the (return - Q) error applied per update
SOLVED_RETURN = 0.99   # scaled episode return at which training stops

try:
    while True:
        state = env.reset()
        done = False
        state_list = []    # observation the agent acted on at each step
        action_list = []   # action taken at each step
        reward_list = []   # scaled reward received at each step
        q_list = []        # network Q-values for each visited observation

        while not done:
            state_list.append(state)
            # verbose=0 keeps the per-step predict call from flooding output.
            q_values = model.predict(state.reshape(1, 4), verbose=0)
            action = get_action(q_values)
            # Keep the (2,) row, not the (1, 2) batch, so stacking below
            # always gives a 2-D array, even for a one-step episode.
            q_list.append(q_values[0])
            state, reward, done, _ = env.step(action)
            action_list.append(action)
            reward_list.append(reward / REWARD_SCALE)
            env.render()

        # Stack once per episode instead of re-copying on every step.
        state_array = np.array(state_list)   # shape (T, 4)
        q_array = np.array(q_list)           # shape (T, 2)

        # Walk the episode backwards so reward_sum is the undiscounted return
        # from each step onward; rows of q_array are updated in place.
        reward_sum = 0
        for q, a, r in zip(q_array[::-1], action_list[::-1], reward_list[::-1]):
            reward_sum += r
            q[a] += (reward_sum - q[a]) * Q_STEP_SIZE

        # After the backward pass reward_sum holds the whole-episode return.
        print(reward_sum)

        model.fit(state_array, q_array, epochs=1, verbose=0)

        if reward_sum >= SOLVED_RETURN:
            # Save to the same path the setup cell loads from.
            model.save_weights(weight_path)
            break
finally:
    # Release the environment (and its render window) even when training is
    # interrupted or raises.
    env.close()