In [None]:
# Clone the repository. The cells below read files relative to it
# (requirements.txt, ./imgs/*) and import sa_handler / pipeline_calls,
# which presumably live in this repository -- verify if an import fails.
# NOTE(review): this cell is not idempotent -- once the %cd below has run,
# re-running it clones again relative to the new working directory.
!git clone https://github.com/alessioborgi/StyleAlignedDiffModels.git

# Change the kernel's working directory to the cloned repository and list it
%cd StyleAlignedDiffModels
%ls

# Set up the Git identity used for commits.
# NOTE(review): --global writes to the user-level Git configuration, not only
# to this clone; it is only needed if you commit from the notebook.
!git config --global user.name "Alessio Borgi"
!git config --global user.email "alessioborgi3@gmail.com"

# The three steps below are disabled; uncomment them to publish local changes.
# Stage the changes
#!git add .

# Commit the changes
#!git commit -m "Added some content to your-file.txt"

# Push the changes (needs credentials for the remote, e.g. a personal access token)
#!git push origin main

In [None]:
# Install the required packages
!pip install -r requirements.txt > /dev/null

In [None]:
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoencoderKL
from diffusers.utils import load_image
from transformers import DPTImageProcessor, DPTForDepthEstimation
import torch
import mediapy
import sa_handler
import pipeline_calls

In [None]:
# init models

# Depth estimator (moved to the GPU) and its image pre-processor, both loaded
# from the "Intel/dpt-hybrid-midas" checkpoint. They are handed to
# pipeline_calls.get_depth_map in the depth-map cell.
depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
feature_processor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")

# Depth ControlNet for SDXL: fp16 variant, safetensors weights, float16 dtype.
controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-depth-sdxl-1.0",
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16,
).to("cuda")
# VAE loaded in float16 from the "sdxl-vae-fp16-fix" checkpoint.
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16).to("cuda")
# SDXL base pipeline assembled with the ControlNet and VAE created above.
pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnet,
    vae=vae,
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16,
).to("cuda")
# NOTE(review): the pipeline is moved to "cuda" just above and model CPU offload
# is enabled right after -- confirm against the installed diffusers version
# that this combination is intended.
pipeline.enable_model_cpu_offload()

# StyleAligned settings. Going by the flag names: attention sharing is on with
# AdaIN applied to queries and keys, while group-norm / layer-norm sharing and
# AdaIN on values are off -- see sa_handler for the authoritative semantics.
sa_args = sa_handler.StyleAlignedArgs(share_group_norm=False,
                                      share_layer_norm=False,
                                      share_attention=True,
                                      adain_queries=True,
                                      adain_keys=True,
                                      adain_values=False,
                                     )
# Wrap the pipeline in a handler and register the settings on it.
handler = sa_handler.Handler(pipeline)
handler.register(sa_args, )

In [None]:
# get depth maps

# train.png is run through the DPT model loaded above to obtain its depth map.
image = load_image("./imgs/train.png")
depth_image1 = pipeline_calls.get_depth_map(image, feature_processor, depth_estimator)
# NOTE(review): sun.png and lena.png are only resized to 1024x1024 and then
# used as-is, without going through get_depth_map -- confirm that these files
# already are depth-like maps.
depth_image2 = load_image("./imgs/sun.png").resize((1024, 1024))
depth_image3 = load_image("./imgs/lena.png").resize((1024, 1024))
# Preview the three images that the next cell passes to the ControlNet call.
mediapy.show_images([depth_image1, depth_image2, depth_image3])

In [None]:
# run ControlNet depth with StyleAligned

reference_prompt = "a poster in flat design style"
target_prompts = [
    "a train in flat design style",
    "the sun in flat design style",
    "a model in a flat design style",
]
controlnet_conditioning_scale = 0.8
num_images_per_prompt = 3 # adjust according to VRAM size

# One latent for the reference image plus one per generated target image.
latent_dtype = pipeline.unet.dtype
latents = torch.randn(1 + num_images_per_prompt, 4, 128, 128).to(latent_dtype)

depth_maps = (depth_image1, depth_image2, depth_image3)
for control_depth, target_prompt in zip(depth_maps, target_prompts):
    # Redraw only the target latents; latents[0] (the reference) stays the same
    # for every prompt.
    latents[1:] = torch.randn(num_images_per_prompt, 4, 128, 128).to(latent_dtype)

    images = pipeline_calls.controlnet_call(
        pipeline,
        [reference_prompt, target_prompt],
        image=control_depth,
        num_inference_steps=50,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        num_images_per_prompt=num_images_per_prompt,
        latents=latents,
    )

    # Show the reference image, the depth map, then the generated results.
    titles = ["reference", "depth"] + [f"result {i}" for i in range(1, len(images))]
    mediapy.show_images([images[0], control_depth] + images[1:], titles=titles)


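### ADDITIONAL FUNCTIONS

The helper functions below derive other kinds of maps from input images (edge map, pose map, scribble, optical flow, 3D point cloud). None of them is called by the depth example above.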

### EDGE MAP

In [None]:
# Imported in this cell so that it runs on a fresh kernel: no earlier cell
# brings cv2, numpy or PIL into scope.
import cv2
import numpy as np
from PIL import Image


def get_edge_map(image: Image.Image, low_threshold: int = 100, high_threshold: int = 200,
                 size: tuple = (1024, 1024)) -> Image.Image:
    """Return a 3-channel Canny edge map of ``image``.

    Args:
        image: Input picture; it is converted to 8-bit grayscale first.
        low_threshold: Lower hysteresis threshold passed to ``cv2.Canny``.
        high_threshold: Upper hysteresis threshold passed to ``cv2.Canny``.
        size: ``(width, height)`` of the returned image.

    Returns:
        A PIL image of the requested size whose three channels all hold the
        same edge map.
    """
    gray = np.array(image.convert("L"))
    edges = cv2.Canny(gray, threshold1=low_threshold, threshold2=high_threshold)

    # Resize while still single-channel, then replicate the result into three
    # identical channels.
    resized = np.array(Image.fromarray(edges).resize(size, Image.BICUBIC))
    return Image.fromarray(np.stack([resized] * 3, axis=-1))

In [None]:
# Example usage
from PIL import Image  # imported here so this cell does not rely on a later cell's import

# NOTE: "./path_to_image.jpg" looks like a placeholder -- point it at an existing image.
image = Image.open("./path_to_image.jpg")
edge_map = get_edge_map(image)
edge_map.show()

### OPENPOSE

In [None]:
!pip install openpose-python

In [None]:
# cv2, numpy and PIL are imported in this cell so that it runs on a fresh
# kernel: no earlier cell brings them into scope.
import cv2
import numpy as np
from PIL import Image
from openpose import pyopenpose as op


def get_pose_map(image: Image.Image, openpose_params: dict, min_confidence: float = 0.1,
                 size: tuple = (1024, 1024)) -> Image.Image:
    """Run OpenPose on ``image`` and draw the detected keypoints as white dots.

    Only the keypoints are drawn (filled circles of radius 5 on a black
    canvas); limbs are not connected.

    Args:
        image: Input picture; it is converted to 3-channel RGB first so that
            grayscale, palette and RGBA inputs are handled uniformly.
        openpose_params: Parameter dictionary passed to
            ``WrapperPython.configure``.
        min_confidence: Keypoints whose confidence is not above this value are
            skipped.
        size: ``(width, height)`` of the returned image.

    Returns:
        A PIL RGB image of the requested size. It is entirely black when
        OpenPose reports no keypoints.
    """
    image_rgb = np.array(image.convert("RGB"))
    # OpenPose's own Python examples feed it images read with OpenCV, i.e. in
    # BGR channel order, so the RGB array is flipped before being handed over.
    image_bgr = np.ascontiguousarray(image_rgb[:, :, ::-1])

    # Initialize OpenPose with the given parameters
    op_wrapper = op.WrapperPython()
    op_wrapper.configure(openpose_params)
    op_wrapper.start()

    datum = op.Datum()
    datum.cvInputData = image_bgr
    # NOTE(review): some OpenPose releases expect op.VectorDatum([datum]) here
    # instead of a plain list -- confirm against the installed version.
    op_wrapper.emplaceAndPop([datum])

    # poseKeypoints is not a (people, keypoints, 3) array when nobody was
    # detected; treat that case as "nothing to draw" instead of crashing.
    pose_keypoints = datum.poseKeypoints
    if pose_keypoints is None or np.ndim(pose_keypoints) != 3:
        pose_keypoints = ()

    pose_map = np.zeros_like(image_rgb)
    for person in pose_keypoints:
        for x, y, confidence in person:
            if confidence > min_confidence:
                cv2.circle(pose_map, (int(x), int(y)), 5, (255, 255, 255), -1)

    return Image.fromarray(pose_map).resize(size, Image.BICUBIC)

In [None]:
# Example usage
from PIL import Image  # imported here so this cell does not rely on a later cell's import

# NOTE: "./path_to_image.jpg" looks like a placeholder -- point it at an existing image.
image = Image.open("./path_to_image.jpg")
# Parameters forwarded unchanged to OpenPose; "model_folder" has to point at
# the directory holding the OpenPose model files.
openpose_params = {
    "model_folder": "./models/",
    "net_resolution": "-1x256",
    "hand": False,
    "face": False,
}
pose_map = get_pose_map(image, openpose_params)
pose_map.show()

### SCRIBBLES

In [None]:
import cv2
import numpy as np
from PIL import Image, ImageDraw

def get_scribble(image: Image.Image, low_threshold: int = 50, high_threshold: int = 150,
                 stroke_width: int = 5, size: tuple = (1024, 1024)) -> Image.Image:
    """Turn ``image`` into a scribble-like map of thickened Canny edges.

    Args:
        image: Input picture; it is converted to 8-bit grayscale first.
        low_threshold: Lower hysteresis threshold passed to ``cv2.Canny``.
        high_threshold: Upper hysteresis threshold passed to ``cv2.Canny``.
        stroke_width: Side length of the square kernel used to dilate the
            edges; larger values give thicker strokes.
        size: ``(width, height)`` of the returned image.

    Returns:
        A PIL RGB image of the requested size with white strokes on black.
    """
    gray = np.array(image.convert("L"))
    edges = cv2.Canny(gray, threshold1=low_threshold, threshold2=high_threshold)

    # Dilating the thin edge lines gives them a hand-drawn stroke thickness.
    kernel = np.ones((stroke_width, stroke_width), np.uint8)
    strokes = cv2.dilate(edges, kernel, iterations=1)

    strokes_rgb = cv2.cvtColor(strokes, cv2.COLOR_GRAY2RGB)
    return Image.fromarray(strokes_rgb).resize(size, Image.BICUBIC)

# Example usage
# NOTE(review): "./path_to_image.jpg" looks like a placeholder -- replace it
# with the path of an existing image before running.
image = Image.open("./path_to_image.jpg")
scribble = get_scribble(image)
# Preview the generated scribble map.
scribble.show()

### Optical Flow Maps

In [None]:
import cv2
import numpy as np
from PIL import Image

def get_optical_flow_map(image1: Image.Image, image2: Image.Image, size: tuple = (1024, 1024)) -> Image.Image:
    """Visualise the dense optical flow from ``image1`` to ``image2`` as a colour image.

    The flow is computed with Farneback's algorithm and encoded in HSV: hue
    holds the flow direction, value holds the min-max normalised magnitude,
    and saturation is set to its maximum so that the hue is visible.

    Args:
        image1: First frame; converted to 8-bit grayscale.
        image2: Second frame; converted to 8-bit grayscale. Both frames are
            handed to OpenCV as-is, so they are expected to have the same size.
        size: ``(width, height)`` of the returned image.

    Returns:
        A PIL RGB image of the requested size.
    """
    prev_gray = np.array(image1.convert("L"))
    next_gray = np.array(image2.convert("L"))

    # Dense flow: one (dx, dy) vector per pixel.
    flow = cv2.calcOpticalFlowFarneback(prev_gray, next_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
    magnitude, angle = cv2.cartToPolar(flow[..., 0], flow[..., 1])

    h, w = flow.shape[:2]
    hsv = np.zeros((h, w, 3), dtype=np.uint8)
    hsv[..., 0] = angle * 180 / np.pi / 2  # Hue: radians mapped to OpenCV's 0-180 hue range
    # Full saturation. Left at 0, the HSV -> RGB conversion ignores the hue and
    # the result is a plain gray image of the magnitude.
    hsv[..., 1] = 255
    hsv[..., 2] = cv2.normalize(magnitude, None, 0, 255, cv2.NORM_MINMAX)  # Value

    # Convert straight from HSV to RGB, the channel order PIL expects.
    flow_rgb = cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB)
    return Image.fromarray(flow_rgb).resize(size, Image.BICUBIC)

In [None]:
from PIL import Image

# Load two consecutive frames from a video or image sequence
# NOTE(review): both paths look like placeholders -- replace them with real
# frames. The frames are passed unchanged to get_optical_flow_map, so they
# presumably need to have the same resolution.
image1 = Image.open("./path_to_first_frame.jpg")
image2 = Image.open("./path_to_second_frame.jpg")

# Generate the optical flow map
optical_flow_map = get_optical_flow_map(image1, image2)

# Display the optical flow map
optical_flow_map.show()

### 3D Point Clouds

In [None]:
import numpy as np
from PIL import Image
import open3d as o3d

def get_point_cloud(image: Image.Image, depth_map: Image.Image, focal_length: float, principal_point: tuple) -> o3d.geometry.PointCloud:
    """Back-project a depth map into a coloured Open3D point cloud.

    Every depth pixel becomes one point using a pinhole camera model with a
    single focal length; the point takes the colour of the matching pixel of
    ``image``.

    Args:
        image: Colour source. It is converted to RGB and, if its size differs
            from the depth map's, resized to match so that there is exactly
            one colour per point.
        depth_map: Depth image. Multi-channel inputs are reduced to a single
            8-bit channel; values are divided by 255, which maps 8-bit depth
            to [0, 1].
        focal_length: Focal length in pixels, used for both axes.
        principal_point: ``(cx, cy)`` in pixels.

    Returns:
        An ``o3d.geometry.PointCloud`` with ``height * width`` points and
        colours in [0, 1].
    """
    depth_np = np.array(depth_map)
    if depth_np.ndim == 3:
        # A colour or alpha-carrying depth image would break the per-pixel
        # arithmetic below, so collapse it to one channel first.
        depth_np = np.array(depth_map.convert("L"))
    depth_np = depth_np.astype(np.float32) / 255.0  # Normalize depth map to [0, 1]

    h, w = depth_np.shape[:2]

    # Pixel grid: cols[r, c] == c and rows[r, c] == r.
    cols, rows = np.meshgrid(np.arange(w), np.arange(h), indexing='xy')

    # Pinhole back-projection from pixel to camera coordinates.
    x = (cols - principal_point[0]) * depth_np / focal_length
    y = (rows - principal_point[1]) * depth_np / focal_length
    z = depth_np
    points = np.stack((x, y, z), axis=-1).reshape(-1, 3).astype(np.float64)

    # Colours must line up one-to-one with the points.
    image_rgb = image.convert("RGB")
    if image_rgb.size != (w, h):
        image_rgb = image_rgb.resize((w, h), Image.BICUBIC)
    colors = np.array(image_rgb).reshape(-1, 3) / 255.0

    point_cloud = o3d.geometry.PointCloud()
    point_cloud.points = o3d.utility.Vector3dVector(points)
    point_cloud.colors = o3d.utility.Vector3dVector(colors)

    return point_cloud

In [None]:
from PIL import Image
import open3d as o3d

# Load an image and its corresponding depth map
# NOTE(review): both paths look like placeholders -- replace them with real files.
image = Image.open("./path_to_image.jpg")
depth_map = Image.open("./path_to_depth_map.png")  # Assuming this is a grayscale image

# Example camera intrinsic parameters
# NOTE(review): (319.5, 239.5) would be the centre of a 640x480 frame; adjust
# the intrinsics to the camera and resolution of the images actually used.
focal_length = 525.0  # Example focal length
principal_point = (319.5, 239.5)  # Example principal point (cx, cy)

# Generate the 3D point cloud
point_cloud = get_point_cloud(image, depth_map, focal_length, principal_point)

# Visualize the point cloud
o3d.visualization.draw_geometries([point_cloud])