In [20]:
import gym 
import random


import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

In [21]:
# Fixed seed for the Python and NumPy RNGs so that random action choices and
# replay sampling vary less between runs. TensorFlow's own RNG (weight
# initialisation) is not seeded here, so training is not fully deterministic.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Create the CartPole environment and seed it so episode start states repeat.
env = gym.make('CartPole-v0')
env.seed(SEED)
# Seed the action-space sampler too, so env.action_space.sample() is repeatable.
env.action_space.seed(SEED)

# Size of one observation vector and number of discrete actions; both are
# used below to size the network's input and output layers.
states = env.observation_space.shape[0]
actions = env.action_space.n

Öncelikle ortamı (env) kurmamız gerekiyor. CartPole ortamı Gym kütüphanesinde hazır olarak sunuluyor; `gym.make` fonksiyonu ile bu ortamı oluşturup `env` değişkenine atıyoruz.

In [22]:

# Baseline: play a few episodes with uniformly random actions and print the
# total reward of each, to see how an untrained agent performs.
episodes = 20
try:
    for episode in range(1, episodes + 1):
        env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            # Sample from the environment's own action space instead of a
            # hard-coded [0, 1] list, so the loop works for any action count.
            action = env.action_space.sample()
            _next_state, reward, done, _info = env.step(action)
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Release the render window even if the loop is interrupted; otherwise it
    # stays open and unresponsive next to the notebook.
    env.close()

Episode:1 Score:16.0
Episode:2 Score:19.0
Episode:3 Score:24.0
Episode:4 Score:18.0
Episode:5 Score:17.0
Episode:6 Score:17.0
Episode:7 Score:21.0
Episode:8 Score:20.0
Episode:9 Score:14.0
Episode:10 Score:12.0
Episode:11 Score:30.0
Episode:12 Score:30.0
Episode:13 Score:22.0
Episode:14 Score:10.0
Episode:15 Score:31.0
Episode:16 Score:34.0
Episode:17 Score:11.0
Episode:18 Score:28.0
Episode:19 Score:23.0
Episode:20 Score:25.0


## 2. Create a Deep Learning Model with Keras

In [23]:
def build_model(states, actions):
    """Assemble the network used by the agent: a small fully connected MLP.

    Args:
        states: Number of values in a single observation vector.
        actions: Number of output units (one per available action).

    Returns:
        An uncompiled Keras ``Sequential`` model.
    """
    layers = [
        # Input arrives with shape (1, states); flatten it to a plain vector.
        Flatten(input_shape=(1, states)),
        Dense(24, activation='relu'),
        Dense(24, activation='relu'),
        # Linear head: one unbounded output per action.
        Dense(actions, activation='linear'),
    ]
    return Sequential(layers)

In [24]:
# Instantiate the network sized for this environment's observations/actions.
model = build_model(states=states, actions=actions)

In [25]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_4 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


Model özetinde görüldüğü gibi 24'er nöronlu 2 tam bağlantılı (fully connected) katmanımız var. İlk katmanda girdi düzleştiriliyor (Flatten) ve 4 değerlik bir vektör elde ediliyor; bu 4 değer önce 24 nörona, oradan yine 24 nörona, en sonunda da aksiyon sayımız kadar (2) çıkışa bağlanıyor. Model bu çıkışlara göre yapacağı harekete, yani sola mı yoksa sağa mı gideceğine karar veriyor.

## 3. Build Agent with Keras-RL

In [26]:
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from tensorflow.keras.models import Sequential

In [27]:
def build_agent(model, actions, memory_limit=50000, nb_steps_warmup=10,
                target_model_update=1e-2):
    """Wrap a Keras model in a keras-rl DQN agent.

    Args:
        model: Keras model whose outputs are one value per action.
        actions: Number of discrete actions (passed as ``nb_actions``).
        memory_limit: Maximum number of transitions kept in replay memory.
        nb_steps_warmup: Number of steps passed to the agent as its warm-up
            period before learning starts.
        target_model_update: Target-network update setting forwarded to
            ``DQNAgent`` unchanged.

    Returns:
        An uncompiled ``DQNAgent`` using a Boltzmann (softmax) Q policy.
    """
    policy = BoltzmannQPolicy()
    # window_length stays at 1 because build_model declares its input as
    # (1, states); the two values must agree.
    memory = SequentialMemory(limit=memory_limit, window_length=1)
    return DQNAgent(model=model, memory=memory, policy=policy,
                    nb_actions=actions, nb_steps_warmup=nb_steps_warmup,
                    target_model_update=target_model_update)

Ajan (agent), ortamda aksiyonlarımızı gerçekleştirir; aldığı puanları (ödülleri) toplar ve bunları maksimize etmeye çalışır.

Politika (policy), ajanın bir durumla karşılaştığında buna hangi aksiyonla karşılık vereceğini temsil eder. Bu politika basit bir aksiyon olarak tanımlanabileceği gibi bütün durumları karşılayan bir arama tablosu şeklinde de tanımlanabilir. Politika dinamik olarak da nitelenebilir; bunun temel nedeni, ajanın içinde bulunduğu durumu değerlendirerek alabileceği aksiyonları aramasıdır (farkına varmasıdır).

In [28]:
# Build the agent around the network and train it on the environment.
dqn = build_agent(model, actions)
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases reject.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
# Keep the returned History object in a variable so the cell does not echo
# its repr and the training metrics remain available afterwards.
history = dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
    1/10000 [..............................] - ETA: 13:28 - reward: 1.0000



97 episodes - episode_reward: 102.351 [9.000, 200.000] - loss: 3.603 - mae: 20.289 - mean_q: 40.941

Interval 2 (10000 steps performed)
50 episodes - episode_reward: 200.000 [200.000, 200.000] - loss: 6.235 - mae: 40.586 - mean_q: 81.761

Interval 3 (20000 steps performed)
50 episodes - episode_reward: 200.000 [200.000, 200.000] - loss: 6.525 - mae: 42.990 - mean_q: 86.465

Interval 4 (30000 steps performed)
50 episodes - episode_reward: 199.480 [174.000, 200.000] - loss: 7.303 - mae: 42.598 - mean_q: 85.588

Interval 5 (40000 steps performed)
done, took 379.113 seconds


<tensorflow.python.keras.callbacks.History at 0x7fc3676c3220>

In [29]:
# Evaluate the trained agent without rendering and report the average
# episode reward over 100 test episodes.
scores = dqn.test(env, nb_episodes=100, visualize=False)
mean_episode_reward = np.mean(scores.history['episode_reward'])
print(mean_episode_reward)

Testing for 100 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
Episode 11: reward: 200.000, steps: 200
Episode 12: reward: 200.000, steps: 200
Episode 13: reward: 200.000, steps: 200
Episode 14: reward: 200.000, steps: 200
Episode 15: reward: 200.000, steps: 200
Episode 16: reward: 200.000, steps: 200
Episode 17: reward: 200.000, steps: 200
Episode 18: reward: 200.000, steps: 200
Episode 19: reward: 200.000, steps: 200
Episode 20: reward: 200.000, steps: 200
Episode 21: reward: 200.000, steps: 200
Episode 22: reward: 200.000, steps: 200
Episode 23: reward: 200.000, steps: 200
Episode 24: reward: 200.000, steps: 200
Episode 25: reward: 

In [34]:

# Watch the trained agent play a few rendered episodes.
try:
    # Bound to a descriptive name rather than `_`, which IPython uses for the
    # last cell output.
    visual_test_history = dqn.test(env, nb_episodes=10, visualize=True)
finally:
    # Close the render window once the demo finishes (or is interrupted).
    env.close()

Testing for 10 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
