In [None]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Create the CartPole environment and read the problem dimensions from it.
env = gym.make('CartPole-v1')
action_size = env.action_space.n
state_size = env.observation_space.shape[0]

# Q-value network: takes a state vector and emits one linear output per
# action; train_model() below acts greedily on these outputs and regresses
# them toward its TD targets.
model = Sequential([
    Dense(24, input_dim=state_size, activation='relu'),
    Dense(24, activation='relu'),
    Dense(action_size, activation='linear'),
])
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Online one-step Q-learning (model-free) using the network as Q-function.
def train_model(episodes=100, gamma=0.99, epsilon=0.1):
    """Train the global ``model`` on the global ``env`` with Q-learning.

    After every environment step the network is fitted for one epoch on a
    single (state, target) pair, where the target for the taken action is
    the one-step TD target and the other actions keep their current
    predictions.

    Args:
        episodes: Number of episodes to run.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Probability of taking a random action instead of the
            greedy one. Pass ``0.0`` for a purely greedy policy.

    Prints a debug line per step and the total reward per episode.
    """
    for episode in range(episodes):
        # gym >= 0.26 returns (observation, info) from reset(); older
        # versions return the observation alone.
        reset_result = env.reset()
        if isinstance(reset_result, tuple):
            reset_result = reset_result[0]
        state = np.reshape(reset_result, [1, state_size])
        done = False
        total_reward = 0

        while not done:
            # One forward pass serves both action selection and the
            # training target (the weights do not change in between).
            q_values = model.predict(state, verbose=0)

            # Epsilon-greedy action selection: without any exploration the
            # agent keeps repeating whatever the initial network prefers.
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = int(np.argmax(q_values[0]))

            # gym >= 0.26 returns a 5-tuple with separate terminated /
            # truncated flags; older versions return a 4-tuple with `done`.
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
                terminated = done
            next_state = np.reshape(next_state, [1, state_size])
            total_reward += reward

            # TD target: on a terminal transition only the bootstrap term
            # is dropped; the immediate reward must be kept.
            if terminated:
                target = reward
            else:
                next_q = model.predict(next_state, verbose=0)[0]
                target = reward + gamma * np.max(next_q)
            q_values[0][action] = target
            model.fit(state, q_values, epochs=1, verbose=0)

            # Log before advancing so State and Next State really differ.
            print(f"Episode: {episode}, State: {state}, Action: {action}, Reward: {reward}, Next State: {next_state}, Done: {done}")

            state = next_state

        # Report the undiscounted return of the finished episode.
        print(f"Episode: {episode}, Total Reward: {total_reward}")

# Train the model (runs immediately when this cell/script is executed).
train_model()

Episode: 0, State: [[-0.03159326  0.21373169 -0.00829929 -0.2614124 ]], Action: 1, Reward: 1.0, Next State: [[-0.03159326  0.21373169 -0.00829929 -0.2614124 ]], Done: False
Episode: 0, State: [[-0.02731862  0.40897113 -0.01352754 -0.5567014 ]], Action: 1, Reward: 1.0, Next State: [[-0.02731862  0.40897113 -0.01352754 -0.5567014 ]], Done: False
Episode: 0, State: [[-0.0191392   0.60428035 -0.02466157 -0.85361546]], Action: 1, Reward: 1.0, Next State: [[-0.0191392   0.60428035 -0.02466157 -0.85361546]], Done: False
Episode: 0, State: [[-0.0070536   0.79972965 -0.04173388 -1.15395   ]], Action: 1, Reward: 1.0, Next State: [[-0.0070536   0.79972965 -0.04173388 -1.15395   ]], Done: False
Episode: 0, State: [[ 0.008941    0.99537027 -0.06481288 -1.4594216 ]], Action: 1, Reward: 1.0, Next State: [[ 0.008941    0.99537027 -0.06481288 -1.4594216 ]], Done: False
Episode: 0, State: [[ 0.0288484   1.1912245  -0.09400131 -1.7716277 ]], Action: 1, Reward: 1.0, Next State: [[ 0.0288484   1.1912245  -