In [2]:
import gym

# Smoke test: run one CartPole episode with random actions to confirm that
# Gym is importable and that the reset/step API behaves as expected.
env = gym.make("CartPole-v1")

# With the five-value step() API used below, reset() returns (observation, info),
# so unpack both instead of binding the whole tuple to `obs`.
obs, info = env.reset()
done = False
while not done:
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    # An episode is over when it terminates OR when it is cut off by the time
    # limit; stepping again after either signal is not allowed without reset().
    done = terminated or truncated
env.close()
print("Gym installation seems to be working!")

Gym installation seems to be working!


In [3]:
import numpy as np
import gym
from gym import spaces

In [11]:
class EVChargingEnv(gym.Env):
    """
    A toy Markov Decision Process for EV charging, using the newer Gym API
    that returns (obs, reward, done, truncated, info) from step().

    State:
        One state-of-charge (SOC) value per vehicle, kept inside
        [soc_min, soc_max].

    Action:
        A binary vector with one entry per vehicle. 1 means the vehicle charges
        this period (SOC rises by ``charging_rate``); 0 means it does not
        (SOC falls by ``discharging_rate``).

    Reward:
        ``-(cost + penalty)`` where ``cost`` is the current period's price
        times the number of charging vehicles, and ``penalty`` is
        ``penalty_deviation`` times the summed absolute gap between each SOC
        and ``desired_soc``. The penalty is evaluated on the SOC *before* the
        action of the current step is applied.

    Episode end:
        ``done`` becomes True once ``num_time_periods`` steps have been taken.
    """

    def __init__(
        self,
        num_vehicles=3,
        num_time_periods=5,
        charging_rate=0.2,
        discharging_rate=0.1,
        electricity_price=None,
        soc_min=0.0,
        soc_max=1.0,
        desired_soc=0.8,
        penalty_deviation=1.0
    ):
        """
        Args:
            num_vehicles: Number of vehicles (length of observation and action).
            num_time_periods: Number of steps in one episode.
            charging_rate: SOC gained per step by a charging vehicle.
            discharging_rate: SOC lost per step by a non-charging vehicle.
            electricity_price: Sequence with one price per time period. When
                None, a constant price of 10.0 is used for every period.
            soc_min: Lower clamp for SOC.
            soc_max: Upper clamp for SOC.
            desired_soc: Target SOC used by the deviation penalty.
            penalty_deviation: Weight of the deviation penalty.

        Raises:
            ValueError: If ``electricity_price`` is not one-dimensional or has
                fewer entries than ``num_time_periods``.
        """
        super().__init__()

        self.num_vehicles = num_vehicles
        self.num_time_periods = num_time_periods
        self.charging_rate = charging_rate
        self.discharging_rate = discharging_rate
        self.soc_min = soc_min
        self.soc_max = soc_max

        # If electricity_price is not provided, use a simple placeholder array
        if electricity_price is None:
            electricity_price = np.ones(num_time_periods) * 10.0
        # np.array copies, so later changes to the caller's sequence cannot
        # silently alter the environment.
        self.electricity_price = np.array(electricity_price, dtype=np.float64)

        # Fail at construction time rather than with an IndexError in the
        # middle of an episode when step() looks up a missing price.
        if (
            self.electricity_price.ndim != 1
            or self.electricity_price.shape[0] < num_time_periods
        ):
            raise ValueError(
                "electricity_price must be a 1-D sequence with at least "
                f"num_time_periods={num_time_periods} entries, got shape "
                f"{self.electricity_price.shape}"
            )

        self.desired_soc = desired_soc
        self.penalty_deviation = penalty_deviation

        # Current time step
        self.current_time = 0

        # Observation space: one SOC value in [soc_min, soc_max] per vehicle,
        # so the shape is (num_vehicles,).
        self.observation_space = spaces.Box(
            low=np.float32(soc_min),
            high=np.float32(soc_max),
            shape=(self.num_vehicles,),
            dtype=np.float32
        )

        # Action space: for each vehicle, {0, 1} => not charging / charging,
        # i.e. a MultiBinary space of length num_vehicles.
        self.action_space = spaces.MultiBinary(self.num_vehicles)

        # Internal state: array of shape (num_vehicles,) for SOC.
        # Stays None until reset() is called.
        self.soc = None

    def reset(self, seed=None, options=None):
        """
        Resets the environment at the start of an episode.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` to seed the
                environment's random generator.
            options: Accepted for API compatibility; not used.

        Returns:
            (observation, info): the initial SOC vector as float32 and an
            empty info dict.
        """
        super().reset(seed=seed)
        self.current_time = 0

        # Draw the initial SOC in [0.5, 0.7] from the environment's own
        # generator (self.np_random) instead of the global numpy RNG, so that
        # the `seed` argument actually controls the initial state.
        initial_soc = self.np_random.uniform(
            low=0.5, high=0.7, size=(self.num_vehicles,)
        )
        # Keep the initial state inside the declared bounds even when the
        # caller configured limits that do not contain [0.5, 0.7].
        self.soc = np.clip(initial_soc, self.soc_min, self.soc_max)

        # Return the initial observation and an empty info dict
        return self._get_obs(), {}

    def step(self, action):
        """
        Executes one step of the environment dynamics given the action.

        Args:
            action: Binary vector of length ``num_vehicles``
                (1 = charge, 0 = do not charge).

        Returns:
          - obs: next observation
          - reward: float, ``-(cost + penalty)``
          - done (bool): True once ``num_time_periods`` steps have been taken
          - truncated (bool): always False in this environment
          - info: empty dict

        Raises:
            RuntimeError: If called before reset() or after the episode has
                already finished.
            AssertionError: If ``action`` is not in the action space.
        """
        if self.soc is None:
            raise RuntimeError("Call reset() before step().")
        if self.current_time >= self.num_time_periods:
            raise RuntimeError(
                "Episode has finished; call reset() before calling step() again."
            )
        assert self.action_space.contains(action), f"Invalid action: {action}"

        # Accept lists/tuples as well as arrays for the vectorized update below.
        action = np.asarray(action)

        # Compute reward = - (cost + penalty)
        price_t = self.electricity_price[self.current_time]
        num_charging = np.sum(action)
        cost = price_t * num_charging  # simplistic cost

        # Deviation penalty from desired SOC (measured before this step's
        # transition is applied)
        deviation = np.abs(self.soc - self.desired_soc)
        penalty = self.penalty_deviation * np.sum(deviation)

        # Typically in RL we *maximize* reward, so we make cost/penalty negative:
        reward = float(-(cost + penalty))

        # State transition: charging vehicles gain charging_rate, the others
        # lose discharging_rate; the result is clamped to [soc_min, soc_max].
        soc_change = np.where(
            action == 1, self.charging_rate, -self.discharging_rate
        )
        self.soc = np.clip(self.soc + soc_change, self.soc_min, self.soc_max)

        self.current_time += 1

        # done = True if we reached the final time period
        done = (self.current_time >= self.num_time_periods)
        # We can optionally incorporate a time-limit or other logic to set truncated=True
        truncated = False

        info = {}
        return self._get_obs(), reward, done, truncated, info

    def _get_obs(self):
        """
        Returns the current observation: a float32 copy of the SOC array.
        """
        return self.soc.astype(np.float32)

    def render(self):
        """
        Prints the current time step and the SOC of every vehicle.
        """
        print(f"Time: {self.current_time}, SOC: {self.soc}")

    def close(self):
        """
        Optional cleanup when closing the environment; nothing to release here.
        """
        pass

In [13]:
# ---------------------------------------------------------------------------
# Example usage with the new Gym step signature:
if __name__ == "__main__":
    # Fixed seed so that re-running this cell repeats the same random actions.
    # The initial SOC repeats too, provided reset() samples from the
    # environment's seeded generator.
    SEED = 42

    env = EVChargingEnv(
        num_vehicles=3,
        num_time_periods=5,
        charging_rate=0.2,
        discharging_rate=0.1,
        electricity_price=[10, 12, 8, 9, 11],
        desired_soc=0.8,
        penalty_deviation=5.0
    )
    # The action-space sampler has its own RNG and must be seeded separately.
    env.action_space.seed(SEED)

    obs, info = env.reset(seed=SEED)
    done = False
    total_reward = 0.0
    # One record per step: time index after the step, action taken,
    # resulting SOC observation, and the reward received.
    schedule_log = []

    try:
        while not done:
            # Random action (0 or 1 for each vehicle)
            action = env.action_space.sample()
            obs, reward, terminated, truncated, info = env.step(action)
            # Either signal ends the episode
            done = terminated or truncated

            total_reward += reward
            # Store the scheduling decisions
            schedule_log.append({
                'time': env.current_time,
                'action': action.copy(),
                'SOC': obs.copy(),
                'reward': reward
            })
            env.render()
    finally:
        # Release environment resources even if a step raises.
        env.close()

    print("Episode finished! Total reward:", total_reward)

Time: 1, SOC: [0.54633299 0.58785279 0.76547381]
Time: 2, SOC: [0.44633299 0.48785279 0.96547381]
Time: 3, SOC: [0.64633299 0.68785279 0.86547381]
Time: 4, SOC: [0.54633299 0.88785279 1.        ]
Time: 5, SOC: [0.44633299 1.         0.9       ]
Episode finished! Total reward: -80.52388342199956


In [14]:
# Display the per-step records (time, action, SOC, reward) collected during the example episode.
schedule_log

[{'time': 1,
  'action': array([0, 0, 1], dtype=int8),
  'SOC': array([0.546333 , 0.5878528, 0.7654738], dtype=float32),
  'reward': -12.501702044933191},
 {'time': 2,
  'action': array([0, 0, 1], dtype=int8),
  'SOC': array([0.446333  , 0.48785278, 0.96547383], dtype=float32),
  'reward': -14.50170204493319},
 {'time': 3,
  'action': array([1, 1, 0], dtype=int8),
  'SOC': array([0.646333 , 0.6878528, 0.8654738], dtype=float32),
  'reward': -20.156440148256856},
 {'time': 4,
  'action': array([0, 1, 1], dtype=int8),
  'SOC': array([0.546333 , 0.8878528, 1.       ], dtype=float32),
  'reward': -19.656440148256856},
 {'time': 5,
  'action': array([0, 1, 0], dtype=int8),
  'SOC': array([0.446333, 1.      , 0.9     ], dtype=float32),
  'reward': -13.707599035619461}]