In [None]:
# Mount Google Drive on the Colab VM, then make the project folder both
# importable (sys.path) and the current working directory.
from google.colab import drive
import os
import sys

drive.mount('/content/drive', force_remount=True)

# TODO: Enter the foldername in your Drive where you have saved the unzipped
# assignment folder, e.g. 'cs231n/assignments/assignment3/'
FOLDERNAME = "cs231n/lora_w_diffusion-main/"
# Reject an empty string as well as None: both would silently point at the
# Drive root instead of the project.
assert FOLDERNAME, "[!] Enter the foldername."
PROJECT_PATH = f"/content/drive/My Drive/{FOLDERNAME}"

# Fail early with a readable message instead of the bare error from os.chdir.
if not os.path.isdir(PROJECT_PATH):
    raise FileNotFoundError(f"[!] Project folder not found: {PROJECT_PATH}")

# Re-running this cell must not append the same entry to sys.path again.
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)

# Change working directory so the relative paths used later resolve inside the project.
os.chdir(PROJECT_PATH)

# Confirm
print("✅ Current working directory:", os.getcwd())
print("📁 Contents:", os.listdir('.'))

In [None]:
!pip install -q diffusers transformers torchvision wandb torchmetrics[image]

# Load stable diffusion model trained with LoRA

In [None]:
# Hyperparameters of the LoRA run whose weights are evaluated in this notebook.
lora_rank = 2
lora_alpha = 1
epochs = 1
lr = 1e-4
dropout = 0.1
dataset = "data_chinese"
agumentation = True
conv_lora = True

device = "cuda"

# The run name is a sequence of "<key>_<value>" fragments joined by "_".
# Booleans are written as 0/1 and the learning rate in scientific notation.
run_name_fields = [
    ("dataset", dataset),
    ("rank", lora_rank),
    ("alpha", lora_alpha),
    ("epochs", epochs),
    ("conv", int(conv_lora)),
    ("lr", format(lr, ".0e")),
    ("dropout", dropout),
    ("augment", int(agumentation)),
]
run_name = "_".join(f"{key}_{value}" for key, value in run_name_fields)
print(run_name)

### Load base stable diffusion model

In [None]:
from lora import LoRALinear, LoRAConv2d
from patch_unet import patch_unet_with_lora, conv_filter

from diffusers import StableDiffusionPipeline
import torch.nn as nn
import torch

# Load the base pipeline in float32 first, then move it to the target device.
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    torch_dtype=torch.float32,
)
pipe = pipe.to(device)

### Load LoRA weights and patch unet

In [None]:
import torch

# Checkpoint file is named after the run configured in the parameter cell.
model_path = f"model_weights/{run_name}.pth"

# Disable gradients on all three pipeline components.
pipe.vae.requires_grad_(False)
pipe.text_encoder.requires_grad_(False)
pipe.unet.requires_grad_(False)

# ADD LORA
# NOTE(review): dropout / conv_filter are not passed here although the run
# parameters define them; confirm the patch matches how the checkpoint was trained.
patch_unet_with_lora(pipe.unet, r=lora_rank, alpha=lora_alpha)
# patch_unet_with_lora(pipe.unet, r=lora_rank, alpha=lora_alpha, dropout=dropout, conv_filter=None)
pipe.unet.to(device)  # Move after patching

# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
state_dict = torch.load(model_path, map_location=device)

# strict=False tolerates keys absent from the checkpoint, but it also drops
# checkpoint keys that the patched UNet does not have. Surface those instead of
# silently evaluating a model whose LoRA weights were never loaded.
load_result = pipe.unet.load_state_dict(state_dict, strict=False)
if load_result.unexpected_keys:
    print(
        f"[!] {len(load_result.unexpected_keys)} checkpoint keys did not match the patched UNet "
        f"and were ignored, e.g. {load_result.unexpected_keys[:5]}"
    )
print(f"Loaded {len(state_dict) - len(load_result.unexpected_keys)} of {len(state_dict)} checkpoint entries from {model_path}")

# Trailing semicolon keeps the full module repr out of the cell output.
pipe.unet.eval();

# Contrastive Language-Image Pre-training (CLIP)

CLIP learns relationships between images and text. It encodes both images and text and computes their similarity based on semantics.

* Image Encoder: Processes images and converts them into a high-dimensional vector representation (embedding) that captures the visual features of the image.
* Text Encoder: Processes text descriptions and converts them into a high-dimensional vector representation (embedding) that captures the semantic meaning of the text.


In [None]:
# Loads the CLIP model and processor
from transformers import CLIPProcessor, CLIPModel

# One checkpoint id is shared by the CLIP model and its processor.
clip_model_name ="openai/clip-vit-large-patch14" # Consistent with CompVis/stable-diffusion-v1-4
clip_processor = CLIPProcessor.from_pretrained(clip_model_name)
clip_model = CLIPModel.from_pretrained(clip_model_name)
clip_model = clip_model.to(device)

# Utility that calculates the CLIP similarity score
from clip import calculate_clip_score

# Generate image folders from model

### Set up parameters

In [9]:
# Number of images sampled for every prompt, and whether results go to wandb.
num_image_to_generate_per_prompt = 1
log_generated_image = True

# Define prompts
# Author and script lists are combined pairwise into "<author> <script>" prompts;
# the Chinese and English lists are parallel translations of each other.
author_chinese = ["米芾", "褚遂良", "邓石如", "书法家"]
author_english = ["mi fu", "chu sui liang", "deng shi ru", "calligrapher"]
script_chinese = ["字体", "书法", "楷书", "行书", "草书", "隶书"]
script_english = ["script", "calligraphy", "regular script", "semi-cursive script", "cursive script", "clerical script"]
# Control prompts that are used on their own, without an author.
unrelated = ["cat", "tree", "dog", "sunrise", "sunset"]
# Fixed typo: the prompt was misspelled as "text mesage".
confusion = ["email", "text message", "paper"]

# Setup generated image path
base_output_dir = f"generated_images/{run_name}"
os.makedirs(base_output_dir, exist_ok=True)

In [None]:
if log_generated_image:
  import wandb

  # SECURITY: never hardcode the API key in the notebook. wandb.login() picks up
  # the WANDB_API_KEY environment variable when it is set and otherwise asks for
  # the key interactively. The key that used to be committed in this cell must be
  # revoked in the W&B account settings.
  wandb.login()

  wandb.init(
      project="stable-diffusion-calligraphy",
      name=run_name,  # give each experiment a unique name
      # Log the actual run parameters rather than literals so the dashboard
      # agrees with run_name.
      config={
          "lora_rank": lora_rank,
          "lora_alpha": lora_alpha,
          "lr": lr,
          "epochs": epochs,
          "dropout": dropout,
          "dataset": dataset,
          "augmentations": agumentation,
          "conv_lora": conv_lora,
      }
  )

### Generate images by prompts

In [11]:
def generate_image(prompt, output_dir, filename_prefix):
    """
    Generate images for one text prompt, score them with CLIP and save them.

    Runs the module-level Stable Diffusion pipeline ``pipe``
    ``num_image_to_generate_per_prompt`` times. Every image is scored with
    ``calculate_clip_score`` and written to
    ``<output_dir>/<filename_prefix>_<i>.png``. When ``log_generated_image``
    is true, the image and its score are also logged to Weights & Biases
    (wandb) and the image is displayed inline.

    Args:
        prompt: Text prompt passed to the pipeline and to the CLIP scorer.
        output_dir: Directory the PNG files are written to; created if missing.
        filename_prefix: File name stem; the image index and ``.png`` are appended.
    """
    # Callers normally create the directory, but saving must not depend on it.
    os.makedirs(output_dir, exist_ok=True)

    for i in range(num_image_to_generate_per_prompt):
        image = pipe(prompt, num_inference_steps=30).images[0]
        clip_score = calculate_clip_score(
            image, prompt, clip_model, clip_processor, device
        )
        print (f"{prompt} | clip_score {clip_score}")
        image_path = os.path.join(output_dir, f"{filename_prefix}_{i}.png")
        image.save(image_path)
        if log_generated_image:
            wandb.log({
                "prompt": prompt,
                "clip_score": clip_score,
                "output image": wandb.Image(image, caption=f"{prompt} | score: {clip_score:.3f}")
            })
            # image.show() was dropped: it launches an external viewer, which is
            # redundant with the inline display below.
            # NOTE(review): inline display stays gated on log_generated_image as
            # in the original; confirm whether it should be unconditional.
            display(image)  # for inline notebook display

def generate_caligraphy_images(author_list, script_list, sub_dir):
    """
    Generate one batch of images for every (author, script) combination.

    Images of each author go to their own folder,
    ``<base_output_dir>/<sub_dir>/<author with spaces as underscores>``, so the
    styles can be compared per author. The prompt is ``"<author> <script>"``
    and the file name prefix is that prompt with spaces replaced by underscores.

    Args:
        author_list: Author names used as the first part of each prompt.
        script_list: Script names used as the second part of each prompt.
        sub_dir: Folder below ``base_output_dir`` that holds the author folders.
    """
    root_dir = os.path.join(base_output_dir, sub_dir)
    for author_name in author_list:
        # The folder is created even when there are no scripts to render.
        target_dir = os.path.join(root_dir, "_".join(author_name.split(" ")))
        os.makedirs(target_dir, exist_ok=True)

        prompts = [f"{author_name} {script_name}" for script_name in script_list]
        for text in prompts:
            generate_image(text, target_dir, "_".join(text.split(" ")))

In [None]:
# Calligraphy prompts: every author/script pair, first with the Chinese lists
# and then with the English lists.
for authors, scripts, language_dir in (
    (author_chinese, script_chinese, "chinese"),
    (author_english, script_english, "english"),
):
    generate_caligraphy_images(authors, scripts, language_dir)

# Control prompts: each list is rendered into its own folder and the prompt
# itself is used as the file name prefix.
for category, category_prompts in (("unrelated", unrelated), ("confusion", confusion)):
    output_dir = os.path.join(base_output_dir, category)
    os.makedirs(output_dir, exist_ok=True)
    for prompt in category_prompts:
        filename_prefix = prompt
        generate_image(prompt, output_dir, filename_prefix)

In [None]:
# Finish the wandb run; this only applies when logging was enabled for the notebook.
if log_generated_image:
  wandb.finish()

 # Frechet Inception Distance (FID)

 FID assesses the similarity between a set of generated images and a set of real images by comparing their statistical properties in a feature space learned by the Inception v3 deep learning model.

 A lower FID score indicates that the statistical distribution of the generated images is more similar to that of the real images. A score of 0 would imply that the two distributions are identical.

 FID = $||\mu_r - \mu_g||^2 + Tr(\Sigma_r + \Sigma_g - 2\sqrt{\Sigma_r \Sigma_g})$

### Set up parameters for preprocess images

In [None]:
# Parameter for generate images and evaluation
from torchvision import transforms
from fid import load_images_from_folder, calculate_fid_score, add_fake_data_and_calculate_fid_score, calculate_fid_score_between_folders

# TODO: Tune transform to capture calligraphy related features in FID
# Preprocessing applied to every image before FID is computed.
# NOTE(review): 299x299 presumably matches the Inception v3 input size; confirm.
target_size = (299, 299)
preprocess_steps = [
    # Bring every image to the same spatial size.
    transforms.Resize(target_size),
    # Drop colour information but keep 3 output channels.
    transforms.Grayscale(num_output_channels=3),
]
transform = transforms.Compose(preprocess_steps)

device = "cuda"

### Baseline: FID between two real images datasets

In [None]:
import os
from torchmetrics.image.fid import FrechetInceptionDistance
# Baseline: fid similarity between real datasets

base_input_dir = "data_curation"
show_example = False

# Folder names of the real reference sets below base_input_dir.
mifu = "米芾"
chu = "褚遂良/褚遂良_千字文"
cat = "cat"
sunrise = "sunrise"


def _real_image_folder(group, name):
    """Return the folder of one real image set inside base_input_dir."""
    return os.path.join(base_input_dir, group, name)


def _load_real_images(folder):
    """Load one folder with the shared transform and keep the second returned value."""
    _, loaded = load_images_from_folder(folder, transform, device, show_example)
    return loaded


mifu_real_image_path = _real_image_folder("chinese", mifu)
chu_real_image_path = _real_image_folder("chinese", chu)
cat_real_image_path = _real_image_folder("unrelated", cat)
sunrise_real_image_path = _real_image_folder("unrelated", sunrise)

mifu_real_images = _load_real_images(mifu_real_image_path)
chu_real_images = _load_real_images(chu_real_image_path)
cat_real_images = _load_real_images(cat_real_image_path)
sunrise_real_images = _load_real_images(sunrise_real_image_path)

# Every real set, including mi fu itself, is compared against the real mi fu set.
# The message prefixes are kept exactly as they were printed before.
baseline_comparisons = (
    (mifu_real_images, f"fid score between {mifu} {mifu} : "),
    (chu_real_images, f"fid score between {chu} {mifu}: "),
    (cat_real_images, f"fid score between {cat} {mifu} : "),
    (sunrise_real_images, f"fid score between {sunrise} {mifu} : "),
)
for candidate_images, message_prefix in baseline_comparisons:
    score = calculate_fid_score(candidate_images, mifu_real_images, device)
    print (f"{message_prefix}{score}")

### Evaluate generated images: FID between generated image and real images dataset

In [None]:
show_example = True


def _load_generated(folder):
    """Load one folder of generated images with the shared transform."""
    _, loaded = load_images_from_folder(folder, transform, device, show_example)
    return loaded


# Collect (images, label) pairs: one per author folder in each prompt language,
# followed by the two control folders.
all_generated_images = []
for language_dir, authors in (("chinese", author_chinese), ("english", author_english)):
    for author in authors:
        generated_path = os.path.join(base_output_dir, language_dir, author.replace(" ", "_"))
        all_generated_images.append((_load_generated(generated_path), author))

unrelated_path = os.path.join(base_output_dir, "unrelated")
confusion_path = os.path.join(base_output_dir, "confusion")
for label, folder in (("unrelated", unrelated_path), ("confusion", confusion_path)):
    generated_images = _load_generated(folder)
    all_generated_images.append((generated_images, label))

# Score every generated set against each real reference set, one reference at a time.
real_references = (
    ("real mi fu", mifu_real_images),
    ("real chu sui liang", chu_real_images),
    ("real cat", cat_real_images),
    ("real sunrise", sunrise_real_images),
)
for reference_name, reference_images in real_references:
    for images, image_name in all_generated_images:
        score = calculate_fid_score(images, reference_images, device)
        print (f"fid score between [{image_name}] and [{reference_name}]: {score}")