In [0]:
# Root of the mounted storage; create_file_info_df() builds its directory paths from this.
mount_path = "/mnt/datamount"
# List the entries directly under the mount point.
# NOTE(review): presumably a quick sanity check that the mount is reachable — confirm.
dbutils.fs.ls(mount_path)

In [0]:
import matplotlib.pyplot as plt
from PIL import Image
import pyspark.sql.functions as F
import io

In [0]:
def display_image(df, sample_size: int = 5, keep_order: bool = True):
    """
    Display sample images from a Spark DataFrame containing image file paths.

    The images are drawn side by side in a single row of a matplotlib figure,
    each titled with the row's 'name' value.

    Args:
        df (DataFrame): Spark DataFrame with at least 'path' and 'name' columns.
        sample_size (int): Maximum number of images to display. Fewer are shown
                           when the DataFrame has fewer rows.
        keep_order (bool): If True, display the first N images in order.
                           If False, display N random images.

    Returns:
        None. Displays images using matplotlib. Nothing is drawn when no rows
        are available.
    """
    if keep_order:
        # take() already returns a plain Python list of Rows.
        images = df.take(sample_size)
    else:
        # orderBy(...).limit(...) is still a lazy DataFrame; collect() is what
        # materializes the shuffled sample as a list of Rows.
        images = df.orderBy(F.rand()).limit(sample_size).collect()

    if not images:
        # Avoid rendering an empty figure when there is nothing to show.
        return

    dbfs_scheme = "dbfs:/"
    # Size the grid by the rows actually fetched so images are not shrunk to
    # leave room for samples that do not exist.
    n_cols = len(images)

    plt.figure(figsize=(10, 10))
    for position, row in enumerate(images, start=1):
        # PIL reads through the local file API, so a "dbfs:/..." URI must be
        # turned into its "/dbfs/..." local path; otherwise it is resolved as a
        # path relative to the notebook directory and fails with
        # FileNotFoundError. Only the leading scheme is rewritten.
        img_path = row.path
        if img_path.startswith(dbfs_scheme):
            img_path = "/dbfs/" + img_path[len(dbfs_scheme):]

        plt.subplot(1, n_cols, position)
        plt.title(row.name)
        plt.axis("off")
        # Close the underlying file handle as soon as the pixels are drawn.
        with Image.open(img_path) as img:
            plt.imshow(img)


def display_bytes_image_in_df(df, sample_size: int = 5, keep_order: bool = True):
    """
    Display sample images from a Spark DataFrame containing encoded image bytes.

    Each row's 'image' value is decoded in memory with PIL and drawn side by
    side in a single row of a matplotlib figure, titled with the row's 'name'.

    Args:
        df (DataFrame): Spark DataFrame with at least 'image' (raw bytes of an
                        encoded image file) and 'name' columns.
        sample_size (int): Maximum number of images to display. Fewer are shown
                           when the DataFrame has fewer rows.
        keep_order (bool): If True, display the first N images in order.
                           If False, display N random images.

    Returns:
        None. Displays images using matplotlib. Nothing is drawn when no rows
        are available.
    """
    if keep_order:
        # take() already returns a plain Python list of Rows.
        images = df.take(sample_size)
    else:
        # orderBy(...).limit(...) is still a lazy DataFrame; collect() is what
        # materializes the shuffled sample as a list of Rows.
        images = df.orderBy(F.rand()).limit(sample_size).collect()

    if not images:
        # Avoid rendering an empty figure when there is nothing to show.
        return

    # Size the grid by the rows actually fetched so images are not shrunk to
    # leave room for samples that do not exist.
    n_cols = len(images)

    plt.figure(figsize=(10, 10))
    for position, row in enumerate(images, start=1):
        plt.subplot(1, n_cols, position)
        plt.title(row.name)
        plt.axis("off")
        # Wrap the bytes in a file-like buffer so PIL can decode them.
        with Image.open(io.BytesIO(row.image)) as img:
            plt.imshow(img)

In [0]:
def create_file_info_df(imgs_dir: str):
    """
    Build a Spark DataFrame listing the files in a directory under the mount.

    Args:
        imgs_dir (str): Directory to list, relative to the module-level
                        ``mount_path``.

    Returns:
        DataFrame: One row per entry returned by ``dbutils.fs.ls`` with string
        columns 'name' and 'path'. An empty directory yields an empty
        DataFrame with the same two columns.

    Raises:
        Exception: If the directory cannot be listed. The original error from
        ``dbutils.fs.ls`` is attached as ``__cause__``.
    """
    full_path = f"{mount_path}/{imgs_dir}"
    try:
        files = dbutils.fs.ls(full_path)
    except Exception as err:
        # Chain the underlying error so the real reason for the failure is not
        # lost behind the generic message.
        raise Exception(f"Directory {full_path} does not exist") from err

    # An explicit schema avoids the schema-inference failure that
    # createDataFrame raises for an empty list (i.e. an empty directory).
    # NOTE(review): column order (name, path) is chosen to match what inference
    # from the former {"path", "name"} dicts is expected to produce — confirm
    # if any caller relies on positional column access.
    rows = [(file.name, file.path) for file in files]
    return spark.createDataFrame(rows, schema="name string, path string")