In [None]:
# Connection settings -- adjust these to match your environment
video_bucket_name = "mybucket"  # bucket the videos are uploaded to
minio_ip = '172.18.0.4'  # IP address of your MinIO server

In [None]:
# Sanity check: list the mounted video folder to confirm it was imported properly
import os

folder_path = "/workspace/videos"
files = os.listdir(folder_path)
print(files)

In [5]:
import os

import boto3

# S3 client pointed at the MinIO endpoint.
# Credentials are taken from the standard AWS environment variables when they
# are set, so real secrets do not have to be committed with the notebook; the
# previous literal values remain as a fallback for the local demo setup.
# NOTE(review): assumes AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY, if present,
# hold the MinIO credentials -- confirm against the container configuration.
s3 = boto3.client(
    's3',
    endpoint_url='http://minio:9000',
    aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID', 'admin'),
    aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY', 'password'),
)


In [None]:
# Create the bucket; a BucketAlreadyOwnedByYou error simply means it is already there
try:
    s3.create_bucket(Bucket=video_bucket_name)
except s3.exceptions.BucketAlreadyOwnedByYou:
    print("Bucket exists.")
else:
    print("Bucket created.")

In [None]:
# Upload every file in the local folder to the bucket, using the file name as the object key
for file_name in files:
    file_path = os.path.join(folder_path, file_name)
    if not os.path.isfile(file_path):
        continue  # skip anything that is not a file (e.g. sub-directories)
    s3.upload_file(file_path, video_bucket_name, file_name)
    print(f"Uploaded {file_name} to {video_bucket_name}")

In [None]:
# Check that the upload succeeded by printing the key of each listed object
response = s3.list_objects_v2(Bucket=video_bucket_name)
for entry in response.get('Contents', []):
    print(entry['Key'])

In [9]:
# --------------------- Uploading Metadata to Iceberg -----------------------------

In [None]:
!pip install opencv-python
import pyspark
from pyspark.sql import SparkSession
import cv2
import boto3
import tempfile

In [11]:
# The Iceberg warehouse is written to the same bucket that holds the videos
data_bucket_name_in_minio = video_bucket_name

CATALOG_URI = "http://nessie:19120/api/v1"  # Nessie server URI
WAREHOUSE = f"s3://{data_bucket_name_in_minio}/"  # S3 address to write to
STORAGE_URI = f"http://{minio_ip}:9000"  # S3 endpoint handed to the catalog

In [None]:
# Initialise Spark with the Iceberg + Nessie catalog configuration

# Maven coordinates handed to spark.jars.packages (comma-joined below)
spark_packages = [
    'org.postgresql:postgresql:42.7.3',
    'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0',
    'org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1',
    'software.amazon.awssdk:bundle:2.24.8',
    'software.amazon.awssdk:url-connection-client:2.24.8',
]

# SQL extensions registered with the session (comma-joined below)
spark_extensions = [
    'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions',
    'org.projectnessie.spark.extensions.NessieSparkSessionExtensions',
]

# Settings are applied in insertion order, one conf.set() call per entry
spark_settings = {
    'spark.jars.packages': ','.join(spark_packages),
    'spark.sql.extensions': ','.join(spark_extensions),
    'spark.sql.catalog.nessie': 'org.apache.iceberg.spark.SparkCatalog',
    'spark.sql.catalog.nessie.uri': CATALOG_URI,
    'spark.sql.catalog.nessie.ref': 'main',
    'spark.sql.catalog.nessie.authentication.type': 'NONE',
    'spark.sql.catalog.nessie.catalog-impl': 'org.apache.iceberg.nessie.NessieCatalog',
    'spark.sql.catalog.nessie.s3.endpoint': STORAGE_URI,
    'spark.sql.catalog.nessie.warehouse': WAREHOUSE,
    'spark.sql.catalog.nessie.io-impl': 'org.apache.iceberg.aws.s3.S3FileIO',
}

conf = pyspark.SparkConf().setAppName('app_name')
for option, setting in spark_settings.items():
    conf.set(option, setting)

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [13]:
# function to extract metadata from a video file
def extract_metadata_from_video(video_path):
    """Return ``(duration_sec, width, height)`` for the video at ``video_path``.

    The file is opened with ``cv2.VideoCapture``. The duration is the reported
    frame count divided by the reported frames-per-second; width and height
    are the reported frame dimensions converted to ``int``.

    Args:
        video_path: Path of the video file to inspect.

    Returns:
        A tuple ``(duration, width, height)`` where ``duration`` is a float
        and ``width`` / ``height`` are ints.

    Raises:
        ValueError: If the file cannot be opened, or if the reported frame
            rate is not a positive number (the duration would be undefined).
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise ValueError(f"Could not open video file: {video_path}")

        fps = cap.get(cv2.CAP_PROP_FPS)
        # "not fps > 0" also rejects NaN, which "fps <= 0" would let through
        if not fps > 0:
            raise ValueError(f"Invalid frame rate ({fps}) reported for: {video_path}")

        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
        height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
    finally:
        # Always release the capture handle, even when validation fails
        cap.release()

    return frame_count / fps, int(width), int(height)

In [None]:
# Extract metadata from each .mp4 object in the bucket.
# A paginator is used because a single list_objects_v2 call returns at most
# 1000 keys, so a larger bucket would otherwise be silently truncated.
paginator = s3.get_paginator('list_objects_v2')

video_metadata_list = []

for response in paginator.paginate(Bucket=data_bucket_name_in_minio):
    for obj in response.get('Contents', []):
        key = obj['Key']

        if not key.lower().endswith('.mp4'):
            continue

        print(f"{key}")

        # Reserve a temp path and close the handle straight away so the
        # download can write to the path and no file descriptor is leaked.
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_path = temp_file.name

        try:
            # A download failure still propagates, as before, but the temp
            # file is now removed on the way out.
            s3.download_file(data_bucket_name_in_minio, key, temp_path)
            try:
                duration, width, height = extract_metadata_from_video(temp_path)
                video_metadata_list.append((key, duration, width, height))
            except Exception as e:
                # Best effort: report the failing video and carry on
                print(f"Failed to extract metadata for {key}: {e}")
        finally:
            os.remove(temp_path)

        print("done")

In [None]:
# Build a Spark DataFrame from the collected metadata and write it to Iceberg
metadata_columns = ["filename", "duration_sec", "width", "height"]
video_df = spark.createDataFrame(video_metadata_list, metadata_columns)
video_df.writeTo("nessie.video_metadata").createOrReplace()

In [None]:
# Read the table back from the Nessie catalog to confirm the write worked
table_name = "nessie.video_metadata"
video_df = spark.read.table(table_name)
video_df.show(truncate=False)

In [17]:
# Stop the Spark session once the notebook's work is finished
spark.stop()