In [13]:
import numpy as np
import tensorflow as tf

# 환경
class Simple1DEnv:
    def __init__(self):
        self.position = np.random.randint(-5, 5)
        self.goal = 3
        self.min_pos = -5
        self.max_pos = 5
    
    def reset(self):
        self.position = np.random.randint(-5, 5)
        return np.array([self.position], dtype=np.float32)
    
    def step(self, action):
        self.position += action
        self.position = np.clip(self.position, self.min_pos, self.max_pos)
        reward = 10 if self.position == self.goal else -1
        done = self.position == self.goal
        return np.array([self.position], dtype=np.float32), reward, done

# Q-network
model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation='relu', input_shape=(1,)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(2)  # 왼쪽(0), 오른쪽(1)
])
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
loss_fn = tf.keras.losses.MeanSquaredError()

# 학습
def train(env, model, episodes=300):
    gamma = 0.95
    epsilon = 1.0
    epsilon_decay = 0.995
    for ep in range(episodes):
        state = env.reset()
        done = False
        while not done:
            if np.random.rand() < epsilon:
                action = np.random.choice([0, 1])  # explore
            else:
                q_vals = model(np.array([state]))
                action = np.argmax(q_vals[0].numpy())  # exploit
            
            next_state, reward, done = env.step(-1 if action == 0 else 1)
            next_q = model(np.array([next_state]))
            max_next_q = np.max(next_q[0].numpy())
            target = reward + gamma * max_next_q if not done else reward

            with tf.GradientTape() as tape:
                q_val = model(np.array([state]))
                target_vec = q_val.numpy()
                target_vec[0][action] = target
                loss = loss_fn(target_vec, q_val)
            
            grads = tape.gradient(loss, model.trainable_variables) # tape함수는 with문을 빠져나왔지만 1회용으로 한번만 with문 밖에서 사용가능함.
            # 한번 쓰면 객체는 비활성화됨
            
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            state = next_state
        
        epsilon *= epsilon_decay
        if ep % 50 == 0:
            print(f"Episode {ep}, epsilon: {epsilon:.3f}")

env = Simple1DEnv()
train(env, model)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Episode 0, epsilon: 0.995
Episode 50, epsilon: 0.774
Episode 100, epsilon: 0.603
Episode 150, epsilon: 0.469
Episode 200, epsilon: 0.365
Episode 250, epsilon: 0.284


In [None]:
for i in range(-5, 6):
    print(f'step{i}\'s proportion is', model.predict(np.array([i])))

print("학습시간: 1분 39초")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
step-5's proportion is [[-0.2727631  1.1616365]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
step-4's proportion is [[0.38832957 2.0730655 ]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
step-3's proportion is [[1.0494218 2.9844942]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
step-2's proportion is [[2.2676234 4.331146 ]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
step-1's proportion is [[3.8177817 5.937064 ]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
step0's proportion is [[5.3670454 7.5422616]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
step1's proportion is [[6.9163103 9.147459 ]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
step2's proportion is [[ 8.098839 10.339747]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [19]:
print(np.zeros((10, ), dtype=np.float32))

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
