In [None]:
# Mount Google Drive into the Colab runtime; later cells read the dataset
# archive from, and write checkpoints to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Copy the dataset archive from the mounted Drive into /content.
!cp -r "/content/drive/MyDrive/data_for_diffusion.zip" "/content"


In [None]:
!unzip /content/data_for_diffusion.zip -d /content/data_for_diffusion

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/data_for_diffusion/data_for_diffusion/n2135057990_CH_19.txt  
  inflating: /content/data_for_diffusion/data_for_diffusion/n9643875887_US_21.txt  
  inflating: /content/data_for_diffusion/data_for_diffusion/n5725291300_CH_19.jpg  
  inflating: /content/data_for_diffusion/data_for_diffusion/n5725291300_CH_19.txt  
  inflating: /content/data_for_diffusion/data_for_diffusion/n6736050043_US_21.jpg  
  inflating: /content/data_for_diffusion/data_for_diffusion/n6736050043_US_21.txt  
  inflating: /content/data_for_diffusion/data_for_diffusion/w23083825_DE_.jpg  
  inflating: /content/data_for_diffusion/data_for_diffusion/a104553180_CH_20.jpg  
  inflating: /content/data_for_diffusion/data_for_diffusion/w23083825_DE_.txt  
  inflating: /content/data_for_diffusion/data_for_diffusion/a104553180_CH_20.txt  
  inflating: /content/data_for_diffusion/data_for_diffusion/w1025951207_US_20.jpg  
  inflating: /content

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms

from diffusers import AutoencoderKL, UNet2DConditionModel, DDPMScheduler
from transformers import CLIPTokenizer, CLIPTextModel
from accelerate import Accelerator

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The token `token_sd` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-aut

In [None]:
from PIL import Image, UnidentifiedImageError

def is_valid_image(path):
    """Report whether the image file at ``path`` can be opened and fully decoded.

    Returns True when PIL opens the file and ``load()`` succeeds. On any
    exception a warning containing the path and the error is printed and
    False is returned, so callers can skip the file instead of failing.
    """
    try:
        with Image.open(path) as candidate:
            # Opening alone does not decode the pixel data; load() forces the
            # full decode so truncated files are caught here.
            candidate.load()
    except Exception as err:
        print(f"Warning: Geçersiz veya truncated resim atlanıyor: {path} ({err})")
        return False
    else:
        return True

class SatelliteDataset(Dataset):
    """Paired image/caption dataset read from a single flat directory.

    Every ``*.jpg`` file in ``root_dir`` that has a caption file with the same
    stem and a ``.txt`` extension, and that passes ``is_valid_image``, becomes
    one sample. File names are stored in sorted order so that a given index
    maps to the same file on every run; ``os.listdir`` alone returns entries
    in arbitrary order, which would make index-based resuming meaningless.
    """

    def __init__(self, root_dir, transform=None):
        """
        root_dir: Folder containing the image (.jpg) and caption (.txt) files.
        transform: Optional callable applied to each loaded PIL image.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.image_files = []

        # Sorted listing -> deterministic sample order across sessions.
        jpg_names = sorted(name for name in os.listdir(root_dir) if name.endswith('.jpg'))

        for f in jpg_names:
            image_path = os.path.join(root_dir, f)
            caption_path = os.path.join(root_dir, self._caption_name(f))

            # Skip the pair when the caption file is missing.
            if not os.path.exists(caption_path):
                print(f"Warning: Caption dosyası bulunamadı, atlanıyor: {caption_path}")
                continue

            # Keep only images that can be opened and fully decoded.
            if is_valid_image(image_path):
                self.image_files.append(f)

    @staticmethod
    def _caption_name(image_filename):
        """Return the caption file name for an image: same stem, ``.txt`` extension."""
        # splitext only swaps the final extension; str.replace('.jpg', '.txt')
        # would also rewrite any earlier '.jpg' that appears inside the stem.
        return os.path.splitext(image_filename)[0] + '.txt'

    def __len__(self):
        """Number of usable image/caption pairs."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return ``{"image": ..., "caption": ...}`` for the sample at ``idx``.

        The image is converted to RGB and passed through ``transform`` when one
        was given; the caption is the stripped UTF-8 text of the caption file.
        """
        image_filename = self.image_files[idx]
        image_path = os.path.join(self.root_dir, image_filename)
        caption_path = os.path.join(self.root_dir, self._caption_name(image_filename))

        # convert() returns a new, fully loaded image, so the underlying file
        # can be released as soon as the with-block exits.
        with Image.open(image_path) as img:
            image = img.convert("RGB")
        if self.transform:
            image = self.transform(image)

        with open(caption_path, 'r', encoding='utf-8') as f:
            caption = f.read().strip()

        return {"image": image, "caption": caption}

from torch.utils.data import Sampler

class ResumeSampler(Sampler):
    """Sequential sampler whose pass starts at ``start_index`` and wraps around.

    Indices ``start_index .. len-1`` are yielded first, followed by
    ``0 .. start_index-1``, so one pass still visits every index once.
    """

    def __init__(self, data_source, start_index=0):
        self.start_index = start_index
        self.data_source = data_source

    def __len__(self):
        return len(self.data_source)

    def __iter__(self):
        # Rotate [0, 1, ..., n-1]: take the tail beginning at start_index,
        # then append the prefix that was skipped.
        order = [*range(len(self.data_source))]
        rotated = order[self.start_index:]
        rotated.extend(order[:self.start_index])
        return iter(rotated)

# Image preprocessing: resize to 512x512, convert to a tensor, then normalize
# each of the three channels with mean 0.5 and std 0.5, i.e. (x - 0.5) / 0.5.
transform = transforms.Compose(
    [
        transforms.Resize(size=(512, 512)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ]
)

In [None]:
model_id = "stabilityai/stable-diffusion-2-1-base"  # Replace with the model id you want to fine-tune.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and text encoder: used to turn captions into conditioning embeddings.
tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(model_id, subfolder="text_encoder").to(device)

# VAE (image <-> latent space) and UNet (noise prediction).
vae = AutoencoderKL.from_pretrained(model_id, subfolder="vae").to(device)
unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet").to(device)

# Noise scheduler used by the diffusion process.
noise_scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler")

# Freeze the VAE and the text encoder; only the UNet is updated during fine-tuning.
vae.requires_grad_(False)
# requires_grad_ returns the module itself; the trailing ';' keeps the notebook
# from echoing the full text-encoder repr as this cell's output.
text_encoder.requires_grad_(False);

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer%2Ftokenizer_config.json:   0%|          | 0.00/807 [00:00<?, ?B/s]

tokenizer%2Fvocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

tokenizer%2Fmerges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer%2Fspecial_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

text_encoder%2Fconfig.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.36G [00:00<?, ?B/s]

vae%2Fconfig.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

unet%2Fconfig.json:   0%|          | 0.00/911 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.46G [00:00<?, ?B/s]

scheduler%2Fscheduler_config.json:   0%|          | 0.00/346 [00:00<?, ?B/s]

CLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 1024)
      (position_embedding): Embedding(77, 1024)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-22): 23 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (layer_norm2): LayerNorm((1

In [None]:
data_root = "/content/data_for_diffusion/data_for_diffusion"
batch_size = 4
dataset = SatelliteDataset(root_dir=data_root, transform=transform)

# Optimizer-step count of the checkpoint being resumed. This is the only value
# that has to be edited by hand when resuming from a different checkpoint.
global_step = 240000

# Dataset position to resume from, derived instead of hand-computed so it can
# never drift out of sync with global_step or the actual dataset size:
#   stored_index = (global_step * batch_size) % number_of_samples
#   e.g. (240000 * 4) % 675000 = 285000
num_samples = len(dataset)
stored_index = (global_step * batch_size) % num_samples if num_samples else 0
current_index = stored_index

sampler = ResumeSampler(dataset, start_index=current_index)
dataloader = DataLoader(dataset, batch_size=batch_size, sampler=sampler, num_workers=1)

accelerator = Accelerator()
optimizer = optim.AdamW(unet.parameters(), lr=1e-5)
unet, optimizer, dataloader = accelerator.prepare(unet, optimizer, dataloader)



In [None]:
resume_checkpoint = "/content/drive/MyDrive/sd/checkpoint-step"

# Restore the state written by accelerator.save_state() further down, if any.
if os.path.exists(resume_checkpoint):
    accelerator.load_state(resume_checkpoint)
    print(f"Checkpoint'ten devam ediliyor: {resume_checkpoint}")
else:
    print("Checkpoint bulunamadı, baştan başlayacak.")

num_epochs = 1
checkpoint_interval = 20000

# diffusers' from_pretrained puts models in eval mode by default; the UNet is
# the module being trained, so switch it to train mode explicitly.
unet.train()

for epoch in range(num_epochs):
    for batch in dataloader:
        images = batch["image"].to(device)
        captions = batch["caption"]

        text_inputs = tokenizer(
            captions,
            padding="max_length",
            truncation=True,
            max_length=tokenizer.model_max_length,
            return_tensors="pt"
        )
        text_input_ids = text_inputs.input_ids.to(device)

        # Both the text encoder and the VAE are frozen, so neither needs autograd.
        with torch.no_grad():
            encoder_hidden_states = text_encoder(text_input_ids)[0]
            latents = vae.encode(images).latent_dist.sample()
            # Use the scaling factor stored in the VAE config rather than a
            # hard-coded constant, so a different VAE is scaled correctly.
            latents = latents * vae.config.scaling_factor

        # randn_like allocates directly on the latents' device with their dtype.
        noise = torch.randn_like(latents)
        # Read the value through .config; direct attribute access on the
        # scheduler is deprecated and emits a warning.
        timesteps = torch.randint(
            0,
            noise_scheduler.config.num_train_timesteps,
            (latents.shape[0],),
            device=device,
        ).long()

        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

        # The regression target depends on how the checkpoint was trained.
        prediction_type = noise_scheduler.config.prediction_type
        if prediction_type == "epsilon":
            target = noise
        elif prediction_type == "v_prediction":
            target = noise_scheduler.get_velocity(latents, noise, timesteps)
        else:
            raise ValueError(f"Unsupported prediction type: {prediction_type}")

        noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

        loss = nn.functional.mse_loss(noise_pred, target)

        optimizer.zero_grad()
        accelerator.backward(loss)
        optimizer.step()

        global_step += 1

        if global_step % 100 == 0:
            print(f"Epoch {epoch+1}, Step {global_step}, Loss: {loss.item()}")

        if global_step % checkpoint_interval == 0:
            # Save into the same directory that is read on resume, so the two
            # paths cannot diverge; each save overwrites the previous one.
            checkpoint_dir = resume_checkpoint
            os.makedirs(checkpoint_dir, exist_ok=True)
            accelerator.save_state(checkpoint_dir)
            print(f"Checkpoint kaydedildi: {checkpoint_dir}")

output_dir = "/content/drive/MyDrive/sd/fine_tuned_unet"
os.makedirs(output_dir, exist_ok=True)
accelerator.wait_for_everyone()
# accelerator.prepare() may wrap the model; unwrap it to reach save_pretrained.
accelerator.unwrap_model(unet).save_pretrained(output_dir)
print("Fine-tuning tamamlandı, model kaydedildi.")


Checkpoint'ten devam ediliyor: /content/drive/MyDrive/sd/checkpoint-step


  deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False)


Epoch 1, Step 240100, Loss: 0.12272065132856369
Epoch 1, Step 240200, Loss: 0.1889694184064865
Epoch 1, Step 240300, Loss: 0.1225949078798294
Epoch 1, Step 240400, Loss: 0.03199201822280884
Epoch 1, Step 240500, Loss: 0.25401079654693604
Epoch 1, Step 240600, Loss: 0.08147139102220535
Epoch 1, Step 240700, Loss: 0.09891625493764877
Epoch 1, Step 240800, Loss: 0.20755860209465027
Epoch 1, Step 240900, Loss: 0.14153122901916504
Epoch 1, Step 241000, Loss: 0.1747812032699585
Epoch 1, Step 241100, Loss: 0.06059031933546066
Epoch 1, Step 241200, Loss: 0.21944281458854675
Epoch 1, Step 241300, Loss: 0.2557469606399536
Epoch 1, Step 241400, Loss: 0.1352803111076355
Epoch 1, Step 241500, Loss: 0.2065563052892685
Epoch 1, Step 241600, Loss: 0.04510542377829552
Epoch 1, Step 241700, Loss: 0.03743895888328552
Epoch 1, Step 241800, Loss: 0.07981802523136139
Epoch 1, Step 241900, Loss: 0.01797499880194664
Epoch 1, Step 242000, Loss: 0.16755837202072144
Epoch 1, Step 242100, Loss: 0.1897247433662414

KeyboardInterrupt: 