In [1]:
import numpy as np

class ReplayBuffer:
    def __init__(self, max_size, obs_dim, act_dim):
        self.max_size = max_size
        self.ptr = 0
        self.size = 0
        
        self.obs_buf = np.zeros((max_size, obs_dim), dtype=np.float32)
        self.next_obs_buf = np.zeros((max_size, obs_dim), dtype=np.float32)
        self.act_buf = np.zeros((max_size, act_dim), dtype=np.float32)
        self.rew_buf = np.zeros((max_size, 1), np.float32)
        self.done_buf = np.zeros((max_size, 1), np.float32)
        
    def store(self, obs, act, rew, next_obs, done):
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.next_obs_buf[self.ptr] = next_obs
        self.done_buf[self.ptr] = done
        
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)
        
    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.max_size, size=batch_size)
        
        return dict(obs = self.obs_buf[idxs],
                    act = self.act_buf[idxs],
                    rews = self.rew_buf[idxs],
                    next_obs = self.next_obs_buf[idxs],
                    done = self.done_buf[idxs])

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class ActorCritic(keras.Model):
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        
        self.actor = keras.Sequential([
            layers.Dense(256, activation='relu'),
            layers.LayerNormalization(),
            layers.Dense(256, activation='relu'),
            layers.Dense(act_dim, activation='tanh')
        ])
        
        self.critic = keras.Sequential([
            layers.Dense(256, activation='relu'),
            layers.LayerNormalization(),
            layers.Dense(256, activation='relu'),
            layers.Dense(1)
        ])
        
    def call(self, inputs):
        obs = inputs
        return self.actor(obs), self.critic(obs)

In [3]:
class Agent:
    def __init__(self, obs_dim, act_dim):
        self.model = ActorCritic(obs_dim, act_dim)
        self.target_model = ActorCritic(obs_dim, act_dim)
        self.target_model.set_weights(self.model.get_weights())
        
        self.buffer = ReplayBuffer(max_size=100000, obs_dim=obs_dim, act_dim=act_dim)
        self.gamma = 0.99
        self.batch_size = 64
        self.optimizer = keras.optimizers.Adam(learning_rate=3e-4)
        
    def act(self, obs):
        obs = tf.convert_to_tensor(obs.reshape(1, -1), dtype=np.float32)
        action, _ = self.model(obs)
        return action.numpy()[0]
    
    def learn(self):
        if self.batch_size > self.buffer.size:
            return
        
        batch = self.buffer.sample_batch(self.batch_size)
        
        obs = tf.convert_to_tensor(batch['obs'], dtype=np.float32)
        act = tf.convert_to_tensor(batch['act'], dtype=np.float32)
        rews = tf.convert_to_tensor(batch['rews'], dtype=np.float32)
        next_obs = tf.convert_to_tensor(batch['next_obs'], dtype=np.float32)
        done = tf.convert_to_tensor(batch['done'], dtype=np.float32)
        
        with tf.GradientTape() as tape:
            _, value = self.model(obs)
            _, next_value = self.target_model(next_obs)
            target = rews + self.gamma*(1-done)*next_value
            critic_loss = tf.reduce_mean((value - target)**2)
            
        gradient = tape.gradient(critic_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradient, self.model.trainable_variables))
        
        tau = 0.005
        new_weights = []
        target_variables = self.target_model.weights
        for i, variable in enumerate(self.model.weights):
            new_weights.append(tau * variable + (1 - tau) * target_variables[i])
        self.target_model.set_weights(new_weights)

In [8]:
import gymnasium as gym

rew_step = []

env = gym.make('Ant-v5', render_mode='human')
act_dim = env.action_space.shape[0]
obs_dim = env.observation_space.shape[0]

agent = Agent(obs_dim, act_dim)

num_episodes = 5

for ep in range(num_episodes):
    
    obs, info = env.reset()
    total_reward = 0
    waiting_time = 0
    
    for t in range(1000): # 5000frame --> step*5 = frame
        action = agent.act(obs)
        noise_scale = max(0.1, 1.0 - ep / num_episodes)  # 점점 줄이기
        noise = np.random.normal(0, noise_scale, size=action.shape)
        action += noise

        action = np.clip(action, -1, 1)
        next_obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        
        shaped_reward = reward
        
        foward_progress = abs(next_obs[0] - obs[0])
        shaped_reward += foward_progress*5
        
        pre_z_posision = obs[2]
        z_position = next_obs[2]
        waiting_penalty = 0
        if abs(pre_z_posision-z_position) + abs(next_obs[0]-obs[0]) < 0.05:
            waiting_time += 1
        else:
            waiting_time = 0
            
        if waiting_time > 20:
            waiting_time = 0
            waiting_penalty = 20
            
        shaped_reward -= waiting_penalty
        waiting_penalty = 0
        
        if z_position < 0.3:
            penalty = 1
        else:
            penalty = 0
        shaped_reward -= penalty
        
        agent.buffer.store(obs, action, shaped_reward, next_obs, done)
        agent.learn()
        
        obs = next_obs
        total_reward += shaped_reward
        
        if done:
            break
        
    # Actor만 따로 저장
    actor_model = agent.model.actor
    actor_model.save("actor_model.keras")

        
    print(f"episode: {ep}, total reward: {total_reward}")
    rew_step.append(total_reward)

input()
env.close()

episode: 0, total reward: -35.18639724309523
episode: 1, total reward: -77.57735372755681




episode: 2, total reward: -148.10883900830652
episode: 3, total reward: -237.88043305341202
episode: 4, total reward: -2095.3317022042615


In [5]:
import tensorflow as tf

# 1. 저장된 모델 불러오기
model = tf.keras.models.load_model("actor_model.keras")

# 2. 기본 TFLite 변환 (float32 모델)
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# 3. 저장
with open("actor_model.tflite", "wb") as f:
    f.write(tflite_model)

# 4. 동적 범위 양자화 적용
converter = tf.lite.TFLiteConverter.from_keras_model(model)  # 다시 converter 초기화
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_quant_model = converter.convert()

# 5. 양자화된 모델 저장
with open("actor_model_quant_dynamic.tflite", "wb") as f:
    f.write(tflite_quant_model)

INFO:tensorflow:Assets written to: C:\Users\dlwns\AppData\Local\Temp\tmpl9nqh7ba\assets


INFO:tensorflow:Assets written to: C:\Users\dlwns\AppData\Local\Temp\tmpl9nqh7ba\assets


Saved artifact at 'C:\Users\dlwns\AppData\Local\Temp\tmpl9nqh7ba'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(1, 105), dtype=tf.float32, name='input_layer')
Output Type:
  TensorSpec(shape=(1, 8), dtype=tf.float32, name=None)
Captures:
  1787755393488: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1787755381584: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1787363675792: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1787363684816: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1787363684624: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1787363680976: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1787363673680: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1787363681936: TensorSpec(shape=(), dtype=tf.resource, name=None)
INFO:tensorflow:Assets written to: C:\Users\dlwns\AppData\Local\Temp\tmpyuvbnxnq\assets


INFO:tensorflow:Assets written to: C:\Users\dlwns\AppData\Local\Temp\tmpyuvbnxnq\assets


Saved artifact at 'C:\Users\dlwns\AppData\Local\Temp\tmpyuvbnxnq'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(1, 105), dtype=tf.float32, name='input_layer')
Output Type:
  TensorSpec(shape=(1, 8), dtype=tf.float32, name=None)
Captures:
  1787755393488: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1787755381584: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1787363675792: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1787363684816: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1787363684624: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1787363680976: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1787363673680: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1787363681936: TensorSpec(shape=(), dtype=tf.resource, name=None)


In [12]:
import tensorflow as tf
import numpy as np
import gymnasium as gym

interpreter = tf.lite.Interpreter(model_path="actor_model_quant_dynamic.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

env = gym.make('Ant-v5', render_mode='human')
obs, _ = env.reset()

for t in range(1000):
    input_data = np.array(obs, dtype=np.float32).reshape(1, -1)
    interpreter.set_tensor(input_details[0]['index'], input_data)
    interpreter.invoke()
    action = interpreter.get_tensor(output_details[0]['index'])[0]
    
    obs, reward, terminated, truncated, _ = env.step(action)
    if terminated or truncated:
        break
    
a = input()
env.close()