In [1]:
import numpy as np
import random
# Definicion de ambiente
class InventoryEnvironment:
    def __init__(self):
        self.products = ['product_A', 'product_B']
        self.max_stock = 10 # Pueden cambiar este número si gustan
        self.demand = {'product_A': [0, 1, 2], 'product_B': [0, 1, 2]}
        self.restock_cost = {'product_A': 5, 'product_B': 7}
        self.sell_price = {'product_A': 10, 'product_B': 15}
        self.state = None 
    def reset(self):
        self.state = {product: random.randint(0, self.max_stock) for product in self.products}
        return self.state
    
    def step(self, action):
        reward = 0
        for product in self.products:
            stock = self.state[product]
            restock = action[product]
            self.state[product] = min(self.max_stock, stock + restock)
            demand = random.choice(self.demand[product])
            sales = min(demand, self.state[product])
            self.state[product] -= sales
            reward += sales * self.sell_price[product] - restock * self.restock_cost[product]
        return self.state, reward
# Init el ambiente
env = InventoryEnvironment()

In [3]:
# Función para generar un episodio
def generate_episode(env, policy, max_steps=10):
    state = env.reset()
    episode = []
    for _ in range(max_steps):
        action = policy(state)
        next_state, reward = env.step(action)
        episode.append((state, action, reward, next_state))
        state = next_state
    return episode

# Política de inventario específica (ejemplo simple que reabastece al azar)
def random_policy(state):
    return {product: random.randint(0, 2) for product in state.keys()}



In [8]:
# Generar episodios y recolectar datos
env = InventoryEnvironment()
episodes = [generate_episode(env, random_policy) for _ in range(100)]

# Mostrar un ejemplo de episodio
for step in episodes[0]:
    print(step)

({'product_A': 6, 'product_B': 4}, {'product_A': 0, 'product_B': 0}, 0, {'product_A': 6, 'product_B': 4})
({'product_A': 6, 'product_B': 4}, {'product_A': 1, 'product_B': 0}, 5, {'product_A': 6, 'product_B': 4})
({'product_A': 6, 'product_B': 4}, {'product_A': 0, 'product_B': 1}, 18, {'product_A': 6, 'product_B': 4})
({'product_A': 6, 'product_B': 4}, {'product_A': 1, 'product_B': 1}, 18, {'product_A': 6, 'product_B': 4})
({'product_A': 6, 'product_B': 4}, {'product_A': 1, 'product_B': 0}, 5, {'product_A': 6, 'product_B': 4})
({'product_A': 6, 'product_B': 4}, {'product_A': 0, 'product_B': 1}, 18, {'product_A': 6, 'product_B': 4})
({'product_A': 6, 'product_B': 4}, {'product_A': 0, 'product_B': 1}, 18, {'product_A': 6, 'product_B': 4})
({'product_A': 6, 'product_B': 4}, {'product_A': 1, 'product_B': 0}, 5, {'product_A': 6, 'product_B': 4})
({'product_A': 6, 'product_B': 4}, {'product_A': 2, 'product_B': 0}, 15, {'product_A': 6, 'product_B': 4})
({'product_A': 6, 'product_B': 4}, {'prod

In [9]:
def exploring_starts(env, num_episodes=100, max_steps=10):
    episodes = []
    for _ in range(num_episodes):
        state = {product: random.randint(0, env.max_stock) for product in env.products}
        action = {product: random.randint(0, 2) for product in env.products}
        episode = []
        for _ in range(max_steps):
            next_state, reward = env.step(action)
            episode.append((state, action, reward, next_state))
            state = next_state
            action = random_policy(state)
        episodes.append(episode)
    return episodes

# Generar episodios con Exploring Starts
exploring_episodes = exploring_starts(env)

In [10]:
def epsilon_greedy_policy(state, epsilon=0.1):
    if random.random() < epsilon:
        return random_policy(state)
    else:
        return {product: 1 for product in state.keys()}

# Generar episodios usando epsilon-greedy
def generate_episode_with_epsilon(env, policy, epsilon=0.1, max_steps=10):
    state = env.reset()
    episode = []
    for _ in range(max_steps):
        action = policy(state, epsilon)
        next_state, reward = env.step(action)
        episode.append((state, action, reward, next_state))
        state = next_state
    return episode

epsilon_episodes = [generate_episode_with_epsilon(env, epsilon_greedy_policy) for _ in range(100)]


In [None]:
# Política objetivo (siempre reponer a nivel 1 por ejemplo)
def target_policy(state):
    return {product: 1 for product in state.keys()}

# Generar episodios usando aprendizaje off-policy
def generate_off_policy_episode(env, behavior_policy, target_policy, epsilon=0.1, max_steps=10):
    state = env.reset()
    episode = []
    for _ in range(max_steps):
        action = behavior_policy(state, epsilon)
        next_state, reward = env.step(action)
        episode.append((state, action, reward, next_state))
        state = next_state
    return episode

off_policy_episodes = [generate_off_policy_episode(env, epsilon_greedy_policy, target_policy) for _ in range(100)]