3 changes: 3 additions & 0 deletions helpers/multiaspect/dataset.py
@@ -87,6 +87,9 @@ def __len__(self):
return len(self.bucket_manager)

def __getitem__(self, image_path):
if image_path is False:
logger.debug(f'Received {image_path} instead of an image path; assuming this is the end of an epoch and passing it up the chain.')
return image_path
logger.debug(f"Running __getitem__ for {image_path} inside Dataloader.")
example = {"instance_images_path": image_path}
if not StateTracker.status_training():
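The change above is a sentinel pattern: when the sampler runs dry, False travels through the Dataset unchanged so the training loop can spot the epoch boundary. A minimal standalone sketch of the idea, using hypothetical names rather than the repo's classes:

class PathDataset:
    """Toy stand-in for the dataset: lets the False sentinel pass through."""

    def __getitem__(self, image_path):
        if image_path is False:
            # End-of-epoch sentinel from the sampler; hand it up unchanged.
            return image_path
        return {"instance_images_path": image_path}

def consume(dataset, keys):
    # Mimics the training loop: stop as soon as the sentinel arrives.
    for key in keys:
        item = dataset[key]
        if item is False:
            break
        yield item

dataset = PathDataset()
print(list(consume(dataset, ["a.png", "b.png", False])))  # two items, then stop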
2 changes: 2 additions & 0 deletions helpers/multiaspect/sampler.py
@@ -351,6 +351,8 @@ def __iter__(self):
f"All buckets exhausted - since this is happening now, most likely you have chronically-underfilled buckets."
)
self._reset_buckets()
# Return False so the training loop knows this epoch is over.
return False

def __len__(self):
return sum(
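Here the sampler resets its buckets and then signals the epoch boundary. One hedge: in the diff the sentinel comes from a return statement, while the sketch below uses yield instead so a plain for loop actually receives the False value; names and bucket layout are illustrative:

def bucket_sampler(buckets):
    # Drain the buckets round-robin, then emit the end-of-epoch sentinel.
    while any(buckets.values()):
        for resolution, paths in buckets.items():
            if paths:
                yield paths.pop()
    # In the real sampler the buckets are refilled here via _reset_buckets().
    yield False

for sample in bucket_sampler({"1024x1024": ["a.png"], "768x1024": ["b.png"]}):
    if sample is False:
        print("epoch complete")
        break
    print("training on", sample)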
4 changes: 3 additions & 1 deletion train_sd21.py
@@ -581,7 +581,9 @@ def main(args):
and epoch == first_epoch
and step < resume_step
):
if step + 2 == resume_step:
if step % args.gradient_accumulation_steps == 0:
progress_bar.update(1)
if step + 1 == resume_step:
# We want to trigger the batch to be properly generated when we start.
if not StateTracker.status_training():
logging.info(
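The fix above ticks the progress bar once per gradient-accumulation boundary while skipping already-trained batches, since the bar counts optimizer steps rather than micro-batches. A self-contained sketch of that accounting, with illustrative numbers:

# Resuming at micro-batch 8 with accumulation of 4 should advance the
# progress bar by exactly 2 optimizer steps, not 8 micro-batches.
gradient_accumulation_steps = 4
resume_step = 8
progress_ticks = 0

for step in range(12):  # stand-in for enumerate(train_dataloader)
    if step < resume_step:
        if step % gradient_accumulation_steps == 0:
            progress_ticks += 1  # one tick per skipped optimizer step
        continue
    # ... normal training path would run from here ...

print(progress_ticks)  # -> 2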
24 changes: 11 additions & 13 deletions train_sdxl.py
@@ -773,19 +773,6 @@ def collate_fn(examples):
ema_unet = None
if args.use_ema:
logger.info("Using EMA. Creating EMAModel.")
decay = unet.config["decay"]
del unet.config["decay"]
min_decay = unet.config["min_decay"]
del unet.config["min_decay"]
update_after_step = unet.config["update_after_step"]
del unet.config["update_after_step"]
use_ema_warmup = unet.config["use_ema_warmup"]
del unet.config["use_ema_warmup"]
inv_gamma = unet.config["inv_gamma"]
del unet.config["inv_gamma"]
power = unet.config["power"]
del unet.config["power"]

ema_unet = EMAModel(
unet.parameters(), model_cls=UNet2DConditionModel, model_config=unet.config
)
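For context on the deletion above: the removed lines pulled EMA hyperparameters out of unet.config (deleting each key as they went) before constructing the EMA model; after this change EMAModel simply falls back to its own defaults. If non-default values are wanted, diffusers' EMAModel accepts them as keyword arguments; a sketch with illustrative values:

from diffusers import UNet2DConditionModel
from diffusers.training_utils import EMAModel

# `unet` is the training script's UNet2DConditionModel; values are examples.
ema_unet = EMAModel(
    unet.parameters(),
    model_cls=UNet2DConditionModel,
    model_config=unet.config,
    decay=0.9999,            # ceiling on the EMA decay rate
    update_after_step=100,   # begin averaging only after this many steps
    use_ema_warmup=True,     # ramp decay up on the inv_gamma/power schedule
)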
@@ -975,13 +962,20 @@ def collate_fn(examples):
unet.train()
train_loss = 0.0
training_luminance_values = []
current_epoch_step = 0
for step, batch in enumerate(train_dataloader):
# If the dataloader hands us False, we have reached the end of the epoch.
if batch is False:
logger.info(f'Reached the end of epoch {epoch}')
break
# Skip steps until we reach the resumed step
if (
args.resume_from_checkpoint
and epoch == first_epoch
and step < resume_step
):
if step % args.gradient_accumulation_steps == 0:
progress_bar.update(1)
if step + 2 == resume_step:
# We want to trigger the batch to be properly generated when we start.
if not StateTracker.status_training():
@@ -1165,6 +1159,7 @@ def collate_fn(examples):
ema_unet.step(unet.parameters())
progress_bar.update(1)
global_step += 1
current_epoch_step += 1
# Average out the luminance values of each batch, so that we can store that in this step.
avg_training_data_luminance = sum(training_luminance_values) / len(
training_luminance_values
@@ -1221,6 +1216,9 @@ def collate_fn(examples):
state_path=os.path.join(save_path, "training_state.json"),
)
logger.info(f"Saved state to {save_path}")
if current_epoch_step > num_update_steps_per_epoch:
logger.info(f'Epoch {epoch} is now complete, having observed {current_epoch_step}/{num_update_steps_per_epoch} update steps this epoch.')
break

logs = {
"step_loss": loss.detach().item(),
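Finally, because the sampler can now run past an epoch boundary, the loop tracks its own update count and breaks once a full epoch's worth of steps has been observed. A self-contained sketch of that guard, with illustrative numbers:

num_update_steps_per_epoch = 10
current_epoch_step = 0

for step in range(100):  # stand-in for the dataloader
    current_epoch_step += 1
    if current_epoch_step > num_update_steps_per_epoch:
        # Mirrors the diff's strict comparison: breaks on the 11th step.
        print(f"epoch complete after observing {current_epoch_step} steps")
        break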