# Movie Poster Processing

In [0]:
# %pip install torch databricks-vectorsearch mlflow[databricks] transformers

In [0]:
from pyspark.sql.functions import monotonically_increasing_id, col
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.vectorsearch import EndpointType
from pyspark.sql import functions as F
import time # For polling
import base64
from pyspark.sql.types import StringType

# Unity Catalog location shared by the table, volume and model in this notebook.
CATALOG = "movie_scripts"
SCHEMA = "ad_placement_agent"

# Delta table that the load cell writes the poster rows into.
image_table = "movie_posters"

# Three-level Unity Catalog name of the registered embedding model, and the
# name of the Model Serving endpoint created for it.
embedding_model_path = f"{CATALOG}.{SCHEMA}.image_embedding_model"
embedding_endpoint_name = 'clip'

# Define volume path (derived from CATALOG/SCHEMA so the location is spelled once)
volume_path = f"/Volumes/{CATALOG}/{SCHEMA}/movie_posters"

# Define target Delta table name for the images
# NOTE(review): delta_table_full_name is not referenced by the cells visible
# here; the load cell writes to image_table instead - confirm which is intended.
catalog_name = CATALOG
schema_name = SCHEMA
table_name = "movie_posters_table"
delta_table_full_name = f"{catalog_name}.{schema_name}.{table_name}"

# Define Vector Search endpoint and index names
#vector_search_endpoint_name = "movie_poster_vs_endpoint" # User can change this name
#vector_search_index_name = "movie_poster_image_index" # User can change this name

# Define image embedding model endpoint name (REPLACE WITH YOUR ACTUAL IMAGE EMBEDDING MODEL ENDPOINT)
# This should be a Databricks Model Serving endpoint that serves an image embedding model.
#image_embedding_model_endpoint = "my_image_embedding_model" # Placeholder

# Load Data and Store in Delta Table

In [0]:
# 1. Load image data
print(f"Loading image data from volume: {volume_path}")

# The volume path is read with the Delta reader (not Spark's "image" reader).
# The "_1" / "_2" columns are renamed to the names the rest of the notebook
# queries: unique_movie_id and image_binary.
# NOTE(review): presumably "_1" is the movie id and "_2" the raw poster bytes - confirm.
raw_posters_df = spark.read.format("delta").load(volume_path)
image_df = (
    raw_posters_df
    .withColumnRenamed("_1", "unique_movie_id")
    .withColumnRenamed("_2", 'image_binary')
)

# Preview a handful of rows and the schema before persisting anything
display(image_df.limit(5))
image_df.printSchema()

# Persist as a managed Delta table, replacing any previous contents
target_table = f'{CATALOG}.{SCHEMA}.{image_table}'
image_df.write.format("delta").mode("overwrite").saveAsTable(target_table)

# Create Pyfunc Model

This MLflow pyfunc model wraps CLIP and will be used to generate the image embeddings.

In [0]:
import mlflow
import torch
import pandas as pd
import requests
import json

# Image embedding class
class CLIP_IMAGE_EMBEDDING(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc model that converts raw image bytes into CLIP image embeddings.

    The model and processor are loaded from the Hugging Face checkpoint named
    in ``MODEL_NAME``. ``predict`` expects a pandas DataFrame with an
    ``image_binary`` column and returns one embedding (a list of floats) per row.
    """

    # Hugging Face checkpoint used for both the model and its processor.
    MODEL_NAME = "openai/clip-vit-large-patch14"

    def load_context(self, context):
        """Load the CLIP model and processor onto the instance.

        Args:
            context: MLflow model context; not used by this implementation.
        """
        from transformers import CLIPProcessor, CLIPModel

        self.model = CLIPModel.from_pretrained(self.MODEL_NAME)
        self.processor = CLIPProcessor.from_pretrained(self.MODEL_NAME)

    def _get_image_embedding_bytearray(self, decoded_bytearray):
        """Embed a single image.

        Args:
            decoded_bytearray: Bytes-like object holding the encoded image file
                contents (anything ``PIL.Image.open`` can read from a buffer).

        Returns:
            The image embedding as a flat list of Python floats.
        """
        import torch
        from PIL import Image
        from io import BytesIO

        image = Image.open(BytesIO(decoded_bytearray))
        inputs = self.processor(images=image, return_tensors="pt")
        # Inference only: skip autograd bookkeeping so no graph is kept alive
        # for every image that is embedded.
        with torch.no_grad():
            image_features = self.model.get_image_features(**inputs)
        # The processor was given one image, so take the only row of the batch.
        return image_features.detach().numpy().tolist()[0]

    def predict(self, context, df):
        """Embed every image in ``df['image_binary']``.

        Args:
            context: MLflow model context; not used by this implementation.
            df: pandas DataFrame with an ``image_binary`` column of image bytes.

        Returns:
            pandas Series with one embedding list per input row.
        """
        return df['image_binary'].apply(self._get_image_embedding_bytearray)

# Test image Loading and Embedding Conversion

In [0]:
from PIL import Image
from io import BytesIO
# Fetch one non-null poster from the table and render it; the bare
# Image.open(...) expression on the last line is what the cell displays.
sample_rows = spark.sql(f'select image_binary from {CATALOG}.{SCHEMA}.{image_table} where image_binary is not NULL limit 1').collect()
test_image = sample_rows[0]['image_binary']
Image.open(BytesIO(test_image))

In [0]:
# Smoke-test the pyfunc class locally on two posters before logging it.
# image_test_pd and test_result are reused below to infer the model signature.
image_test_pd = spark.sql(
    f'select image_binary from {CATALOG}.{SCHEMA}.{image_table} where image_binary is not NULL limit 2'
).toPandas()

clip = CLIP_IMAGE_EMBEDDING()
clip.load_context(context=None)  # no MLflow context is needed to load the weights

test_result = clip.predict(context=None, df=image_test_pd)
test_result

# Log Model with MLflow

In [0]:

from mlflow.models.signature import ModelSignature, infer_signature

# Infer the model signature from the sample input frame and one sample embedding.
sample_output = [test_result[0]]
signature1 = infer_signature(image_test_pd, sample_output)
# NOTE(review): signature2 (with an 'input_type' param) is not referenced by the
# cells visible here, and the model's predict() takes no params - confirm it is still needed.
signature2 = infer_signature(image_test_pd, sample_output, params={'input_type':'text'})

# Pinned environment for the logged model.
# The torch/torchvision pins are CUDA 12.1 builds resolved through the extra
# PyTorch index, and the flash-attn wheel targets CPython 3.11 / torch 2.3 on
# linux x86_64 (all per the version strings and wheel file name).
# NOTE(review): several pins (e.g. bcrypt, boto3, pytesseract) are not imported
# by the model class in this notebook - confirm whether the serving image needs them.
pip_requirements = [
    "--extra-index-url https://download.pytorch.org/whl/cu121",
    "mlflow==2.15.1",
    "setuptools<70.0.0",
    "torch==2.3.1+cu121",
    "accelerate==0.31.0",
    "astunparse==1.6.3",
    "bcrypt==3.2.0",
    "boto3==1.34.39",
    "configparser==5.2.0",
    "defusedxml==0.7.1",
    "dill==0.3.6",
    "google-cloud-storage==2.10.0",
    "ipython==8.15.0",
    "lz4==4.3.2",
    "nvidia-ml-py==12.555.43",
    "optree==0.12.1",
    "pandas==1.5.3",
    "pyopenssl==23.2.0",
    "pytesseract==0.3.10",
    "scikit-learn==1.3.0",
    "sentencepiece==0.1.99",
    "torchvision==0.18.1+cu121",
    "transformers==4.41.2",
    "https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.3cxx11abiFALSE-cp311-cp311-linux_x86_64.whl",
]

In [0]:
# Register the model in Unity Catalog
with mlflow.start_run(run_name='scripts_clip_image') as run:
    # Keep the returned ModelInfo so the version created by THIS run can be tagged below.
    model_info = mlflow.pyfunc.log_model(
        registered_model_name=embedding_model_path,
        python_model=CLIP_IMAGE_EMBEDDING(),
        artifact_path="scripts_clip_image",
        signature=signature1,
        pip_requirements=pip_requirements
    )

client = mlflow.tracking.MlflowClient()

# Every run of this cell registers a new model version, so the alias and the
# description must target that version rather than a hard-coded 1.
registered_version = getattr(model_info, "registered_model_version", None)
if registered_version is None:
    # Fallback for MLflow releases whose ModelInfo does not expose the
    # version: use the highest version currently registered under this name.
    registered_version = max(
        int(mv.version)
        for mv in client.search_model_versions(f"name='{embedding_model_path}'")
    )

client.set_registered_model_alias(
    name=embedding_model_path,
    alias='clipimages',
    version=registered_version
)

client.update_model_version(
    name=embedding_model_path,
    version=registered_version,
    description="Only does image embeddings using CLIP"
)

# Serve the Model & Query Endpoint

In [0]:
# Read the API token and the workspace URL from the notebook context, so the
# request goes to the workspace that issued the token instead of a hard-coded host.
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
notebook_token = notebook_context.apiToken().get()
workspace_url = notebook_context.apiUrl().get().rstrip("/")

# Define the endpoint URL and headers
url = f"{workspace_url}/api/2.0/serving-endpoints"
headers = {
    "Authorization": f"Bearer {notebook_token}",
    "Content-Type": "application/json"
}

# Serve the model version that currently carries the 'clipimages' alias (set
# when the model is registered) rather than assuming it is version 1.
served_model_version = mlflow.tracking.MlflowClient().get_model_version_by_alias(
    name=embedding_model_path, alias="clipimages"
).version

# Define the payload for creating the model serving endpoint
payload = {
    "name": embedding_endpoint_name,
    "config": {
        "served_entities": [
            {
                "entity_name": embedding_model_path,
                "entity_version": str(served_model_version),
                "workload_size": "Medium",
                "scale_to_zero_enabled": True,
                "workload_type": "GPU_SMALL"
            }
        ]
    }
}

# Make the POST request to create the serving endpoint; the timeout keeps the
# cell from hanging forever if the API never answers.
response = requests.post(url, headers=headers, json=payload, timeout=60)

# Check the response status
if response.status_code == 200:
    print("Model serving endpoint created successfully.")
else:
    print(f"Failed to create model serving endpoint: {response.text}")

In [0]:
image_test_pd=spark.sql(f'select image_binary from {CATALOG}.{SCHEMA}.{image_table} where image_binary is not NULL limit 2').toPandas()

# The column holds raw image bytes, which cannot be placed in a JSON body
# as-is (json serialization raises TypeError on bytes); send them as a
# base64-encoded string instead.
raw_image_bytes = image_test_pd.head(1).iloc[0]['image_binary']
image_base_64 = base64.b64encode(raw_image_bytes).decode("utf-8")

# Read the API token and the workspace URL from the notebook context, so the
# request goes to the workspace that issued the token instead of a hard-coded host.
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
notebook_token = notebook_context.apiToken().get()
workspace_url = notebook_context.apiUrl().get().rstrip("/")

# Define the model serving endpoint URL
endpoint_url = f"{workspace_url}/serving-endpoints/{embedding_endpoint_name}/invocations"

input_data = {
  "inputs" : [image_base_64]
  # ,"params" : {'input_type':'image'} # use with a model that can produce both text and image embeddings
}

# Set the headers for the request
headers = {
    "Content-Type": "application/json",
    "Authorization": f'Bearer {notebook_token}'
}

# Make the request to the model serving endpoint
response = requests.post(endpoint_url, headers=headers, data=json.dumps(input_data))

# Parse the response
response_data = response.json()

# Display the response data
display(response_data)