pre-trained: https://github.com/CompVis/stable-diffusion

https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/image_variation

In [None]:
# Install the required libraries.
!pip install accelerate
!pip install diffusers transformers

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [None]:
# Mount Google Drive at /content/drive (the dataset paths used below live under it).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import random
# Directory that holds the real training images.
directory_path = '/content/drive/MyDrive/ArtificialIntelligence/Project/train/real'

# Collect the full path of every regular file in the directory.
# os.listdir() returns entries in arbitrary order, so the list is sorted:
# without a stable order the fixed seed below would not reproduce the same
# sample from one run (or one machine) to the next.
all_files = sorted(
    os.path.join(directory_path, f)
    for f in os.listdir(directory_path)
    if os.path.isfile(os.path.join(directory_path, f))
)

# Draw a reproducible random sample of at most 10 files.
random.seed(42)

if len(all_files) > 10:
    selected_files = random.sample(all_files, 10)
else:
    selected_files = all_files

In [None]:
# Show the sampled file paths.
print(selected_files)

['/content/drive/MyDrive/ArtificialIntelligence/Project/train/real/22480.jpg', '/content/drive/MyDrive/ArtificialIntelligence/Project/train/real/62372.jpg', '/content/drive/MyDrive/ArtificialIntelligence/Project/train/real/69496.jpg', '/content/drive/MyDrive/ArtificialIntelligence/Project/train/real/10875.jpg', '/content/drive/MyDrive/ArtificialIntelligence/Project/train/real/45903.jpg', '/content/drive/MyDrive/ArtificialIntelligence/Project/train/real/49131.jpg', '/content/drive/MyDrive/ArtificialIntelligence/Project/train/real/51815.jpg', '/content/drive/MyDrive/ArtificialIntelligence/Project/train/real/59690.jpg', '/content/drive/MyDrive/ArtificialIntelligence/Project/train/real/10630.jpg', '/content/drive/MyDrive/ArtificialIntelligence/Project/train/real/61888.jpg']


In [None]:
# Inspect the first sampled path.
selected_files[0]

'/content/drive/MyDrive/ArtificialIntelligence/Project/train/real/22480.jpg'

##### 생성된 결과가 train image와 너무 안 비슷하다. self-evolving의 개념을 활용해서 원본 이미지의 스타일과 최대한 비슷하면서도 prompt대로 따라갈 수는 없을까?

##### VGG-19 말고 Inception으로도 해보기

In [None]:
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image

# Load the VGG19 network as a multi-layer feature extractor.
class VGGFeatures(nn.Module):
    """Frozen VGG19 trunk that returns the outputs of selected conv layers.

    ``forward`` pushes the input through the ``features`` stack in its
    original order, so every ReLU, pooling and intermediate convolution
    between two tapped layers is applied. (Chaining only the tapped
    convolutions would run, because their channel counts happen to line up,
    but the results would not be VGG19 activations.)
    """

    def __init__(self):
        super(VGGFeatures, self).__init__()
        vgg_pretrained_features = models.vgg19(pretrained=True).features

        # Position inside ``vgg19().features`` -> key used in the result dict.
        self.layers = {
            '0': 'conv1_1',
            '5': 'conv2_1',
            '10': 'conv3_1',
            '19': 'conv4_1',
            '28': 'conv5_1'
        }

        # Keep the contiguous stack up to (and including) the deepest tap;
        # slicing an nn.Sequential preserves the original layer indices.
        last_index = max(int(idx) for idx in self.layers)
        self.features = vgg_pretrained_features[:last_index + 1]

        # An in-place ReLU would overwrite the conv output that was just
        # stored in the result dict, so make every ReLU out-of-place.
        for module in self.features:
            if isinstance(module, nn.ReLU):
                module.inplace = False

        # The extractor is only used for measuring distances; never train it.
        for param in self.features.parameters():
            param.requires_grad = False

    def forward(self, x):
        """Return ``{layer_name: activation}`` for every layer in ``self.layers``.

        Args:
            x: input batch accepted by the first layer of ``self.features``.

        Returns:
            dict mapping each name in ``self.layers`` to the output of the
            corresponding layer, in network order.
        """
        results = {}
        for idx, layer in enumerate(self.features):
            x = layer(x)
            layer_name = self.layers.get(str(idx))
            if layer_name is not None:
                results[layer_name] = x
        return results


# Instantiate the VGG19 feature extractor and move it to the GPU.
vgg = VGGFeatures().to('cuda')

Downloading: "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth" to /root/.cache/torch/hub/checkpoints/vgg19-dcbb9e9d.pth
100%|██████████| 548M/548M [00:03<00:00, 144MB/s]


In [None]:
import torch.nn as nn
import torchvision.models as models

class InceptionFeatures(nn.Module):
    """Frozen Inception v3 trunk that returns the outputs of selected blocks.

    ``forward`` runs the stem and then every Mixed block in network order,
    recording the output of each block named in ``self.layers``. Blocks that
    are not tapped (e.g. ``Mixed_5c``/``Mixed_5d``) are still executed,
    because the following block expects their output channel count.
    """

    # Execution order of the Inception v3 trunk up to ``Mixed_7a``.
    _TRUNK_ORDER = (
        'Conv2d_1a_3x3',
        'Conv2d_2a_3x3',
        'Conv2d_2b_3x3',
        'maxpool1',
        'Conv2d_3b_1x1',
        'Conv2d_4a_3x3',
        'maxpool2',
        'Mixed_5b',
        'Mixed_5c',
        'Mixed_5d',
        'Mixed_6a',
        'Mixed_6b',
        'Mixed_6c',
        'Mixed_6d',
        'Mixed_6e',
        'Mixed_7a',
    )

    def __init__(self):
        super(InceptionFeatures, self).__init__()
        inception_pretrained_features = models.inception_v3(pretrained=True)

        # Inception v3 carries aux_logits / transform_input switches; turn
        # both off since only the convolutional trunk is used here.
        self.inception = inception_pretrained_features
        self.inception.aux_logits = False
        self.inception.transform_input = False
        # Inference mode: normalisation layers must use their stored
        # statistics rather than statistics of the (tiny) current batch.
        self.inception.eval()

        # Attribute name on the torchvision model -> descriptive alias.
        # The result dict of ``forward`` is keyed by the attribute name.
        self.layers = {
            'Mixed_5b': 'mixed_5b',
            'Mixed_6a': 'mixed_6a',
            'Mixed_6b': 'mixed_6b',
            'Mixed_6c': 'mixed_6c',
            'Mixed_7a': 'mixed_7a'
        }

        # Direct handles to the tapped blocks (kept for introspection).
        self.features = nn.ModuleDict()
        for layer_name in self.layers.keys():
            self.features[layer_name] = getattr(self.inception, layer_name)

        # Freeze the whole network, stem included.
        for param in self.inception.parameters():
            param.requires_grad = False

    def forward(self, x):
        """Return ``{block_name: activation}`` for every key of ``self.layers``.

        Args:
            x: input batch accepted by the first stem convolution.

        Returns:
            dict keyed by the block attribute name (e.g. ``'Mixed_5b'``), in
            network order.
        """
        results = {}
        remaining = set(self.layers)
        for stage_name in self._TRUNK_ORDER:
            x = getattr(self.inception, stage_name)(x)
            if stage_name in remaining:
                results[stage_name] = x
                remaining.discard(stage_name)
                # Nothing deeper is needed once every tap has been collected.
                if not remaining:
                    break
        return results

# Instantiate the Inception v3 feature extractor and move it to the GPU.
Inception = InceptionFeatures().to('cuda')

Downloading: "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth" to /root/.cache/torch/hub/checkpoints/inception_v3_google-0cc3c7bd.pth
100%|██████████| 104M/104M [00:01<00:00, 102MB/s]


In [None]:
# Image preprocessing
def preprocess_image(image_path):
    """Load an image file and return it as a normalised, batched tensor.

    The file is opened inside a context manager so its handle is released
    deterministically, and the actual transform is delegated to
    ``preprocess_image_wo_path`` so both entry points share one pipeline.

    Args:
        image_path: path of the image file to load.

    Returns:
        The tensor produced by ``preprocess_image_wo_path`` for the RGB
        version of the image.
    """
    with Image.open(image_path) as source:
        # convert() returns a fully loaded copy, safe to use after closing.
        image = source.convert('RGB')
    return preprocess_image_wo_path(image)

def preprocess_image_wo_path(image):
    """Turn an already-loaded image into a normalised, batched tensor.

    The image is resized to 224x224, converted to a tensor, normalised with
    the mean/std constants below, and given a leading batch dimension.

    Args:
        image: the image object to transform (no file access happens here).

    Returns:
        The transformed tensor with an extra dimension inserted at index 0.
    """
    steps = [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
    pipeline = transforms.Compose(steps)
    tensor = pipeline(image)
    return tensor.unsqueeze(0)

# Style feature comparison
def gram_matrix(input):
    """Return the normalised Gram matrix of a 4-D feature map.

    The first two dimensions are flattened into rows and the last two into
    columns; the Gram matrix is the product of that matrix with its
    transpose, divided by the total number of elements.

    ``reshape`` is used instead of ``view`` so that non-contiguous inputs
    (for example permuted tensors) are accepted as well.

    Args:
        input: tensor with exactly four dimensions ``(a, b, c, d)``.

    Returns:
        Tensor of shape ``(a * b, a * b)``.
    """
    a, b, c, d = input.size()
    features = input.reshape(a * b, c * d)
    G = torch.mm(features, features.t())
    return G.div(a * b * c * d)

def style_distance(target_features, style_features):
    """Sum the L1 distances between per-layer Gram matrices.

    Args:
        target_features: dict mapping layer name -> feature map.
        style_features: dict holding a feature map for every key of
            ``target_features``.

    Returns:
        The accumulated distance as a Python float; ``0.0`` when
        ``target_features`` is empty.

    Raises:
        KeyError: if ``style_features`` lacks a layer present in
            ``target_features``.
    """
    distance = 0
    for layer in target_features.keys():
        gram_target = gram_matrix(target_features[layer])
        gram_style = gram_matrix(style_features[layer])
        distance += torch.nn.functional.l1_loss(gram_target, gram_style)
    # float() handles both the accumulated one-element tensor and the plain
    # int 0 left over when there were no layers (int has no .item()).
    return float(distance)

In [None]:
from diffusers import StableDiffusionImageVariationPipeline
from PIL import Image
import torch


# Initialise the Stable Diffusion image-variation pipeline.
pipe = StableDiffusionImageVariationPipeline.from_pretrained(
    "lambdalabs/sd-image-variations-diffusers",
    revision="v2.0",
    torch_dtype=torch.float16,
    safety_checker=None,
).to("cuda")

# Largest style distance that is accepted without re-sampling.
threshold = 0.2
# Number of additional samples drawn when the first one is too far off.
max_retries = 5

# Iterate over whatever was sampled instead of assuming exactly 10 files.
for i, input_image_path in enumerate(selected_files):
    # Load the real face image.
    img = Image.open(input_image_path).convert('RGB')

    # Style features of the source image; computed once per image.
    target_feat = vgg(preprocess_image_wo_path(img).to('cuda'))

    best_image = None
    best_distance = float('inf')

    for attempt in range(1 + max_retries):
        # This pipeline is conditioned on the input image only: its __call__
        # rejects `prompt` (TypeError) and has no `strength` argument, so
        # neither is passed. Each call draws a fresh random variation.
        # guidance_scale controls how strongly the conditioning is followed.
        candidate = pipe(image=img, num_inference_steps=50, guidance_scale=20).images[0]

        gen_feat = vgg(preprocess_image_wo_path(candidate).to('cuda'))
        distance = style_distance(target_feat, gen_feat)

        if attempt == 0:
            print(f'{i}-th image\'s Difference(distance): {distance}')

        # Remember the closest sample so the saved image is always defined
        # and is the best one seen, not merely the last one generated.
        if best_image is None or distance < best_distance:
            best_image = candidate
            best_distance = distance

        if best_distance <= threshold:
            break

    path = f'/content/drive/MyDrive/ArtificialIntelligence/Project/GeneratedImages/image{i}.jpg'
    best_image.save(path)

print("Completed generating images.")

image_encoder/model.safetensors not found


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

scheduler/scheduler_config.json:   0%|          | 0.00/284 [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/595 [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/871 [00:00<?, ?B/s]

image_encoder/config.json:   0%|          | 0.00/703 [00:00<?, ?B/s]

(…)ature_extractor/preprocessor_config.json:   0%|          | 0.00/518 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

diffusion_pytorch_model.bin:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

diffusion_pytorch_model.bin:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


TypeError: StableDiffusionImageVariationPipeline.__call__() got an unexpected keyword argument 'prompt'

##### 비슷하게 만들 수 없다면, 처음부터 얼굴 부분만 segmentation해서 변형시킬 수는 없을까?