In [10]:
import cv2
import requests
import numpy as np
import pandas as pd
from PIL import Image
from io import BytesIO

# Read the source table; its THUMBNAIL column holds the image URLs to featurize.
CSV_PATH = "audio_video.csv"  # Adjust the file path as needed
df = pd.read_csv(CSV_PATH)

# Define a function to process each image with ORB
def get_image_features(url, n_features=100, feature_len=2048, timeout=5):
    """Download the image at ``url`` and return a fixed-length ORB feature vector.

    The image is fetched over HTTP, converted to grayscale, and run through
    OpenCV's ORB detector. The resulting descriptor matrix is flattened and
    then truncated or zero-padded to exactly ``feature_len`` values.

    Args:
        url: Address of the image to download.
        n_features: Maximum number of ORB keypoints to request.
        feature_len: Length of the returned vector.
        timeout: HTTP timeout in seconds passed to ``requests.get``.

    Returns:
        A 1-D ``uint8`` NumPy array of length ``feature_len``. An all-zero
        vector is returned when the download or decoding fails, or when ORB
        finds no descriptors, so callers can treat "all zeros" as "no features".
    """
    # One buffer handles trimming, padding and the "nothing found" default.
    # uint8 is used so failures share a dtype with real descriptors
    # (ORB descriptors are uint8 per the OpenCV documentation).
    features = np.zeros(feature_len, dtype=np.uint8)
    try:
        # Download the image
        response = requests.get(url, timeout=timeout)
        # Fail on 4xx/5xx here so the log shows the HTTP status instead of a
        # confusing "cannot identify image file" raised on an error page body.
        response.raise_for_status()
        img = np.array(Image.open(BytesIO(response.content)).convert("RGB"))

        # Convert to grayscale for ORB
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

        # Detect keypoints and compute their descriptors
        orb = cv2.ORB_create(nfeatures=n_features)
        _, descriptors = orb.detectAndCompute(gray, None)

        # descriptors is None when no keypoints were found; the buffer then
        # stays all-zero.
        if descriptors is not None:
            flat = descriptors.flatten()[:feature_len]  # trim if too long
            features[:len(flat)] = flat  # the remaining tail stays zero-padded
    except Exception as e:
        # Best-effort by design: one bad thumbnail must not abort the whole run.
        print(f"Error processing {url}: {e}")
        return np.zeros(feature_len, dtype=np.uint8)  # Default vector if there's an error

    return features



In [20]:
# Run the ORB extractor over every thumbnail URL; the result is a Series
# holding one feature vector per row of df.
thumbnail_urls = df["THUMBNAIL"]
image_features = thumbnail_urls.apply(get_image_features)

Error processing https://res.cloudinary.com/encore-prod/video/upload/ar_1:1,c_fill,so_2/v1727039284/Feeds/uqipngthd87tmorqcvui.png: cannot identify image file <_io.BytesIO object at 0x17d8eaca0>
Error processing https://res.cloudinary.com/encore-prod/video/upload/ar_1:1,c_fill,so_2/v1726861517/Feeds/tjf2sgtlgdtfs2ci0x9i.png: cannot identify image file <_io.BytesIO object at 0x178d36520>
Error processing https://res.cloudinary.com/encore-prod/video/upload/ar_1:1,c_fill,so_2/v1724719859/Feeds/dqpnggwybx0fjcaddsh9.png: cannot identify image file <_io.BytesIO object at 0x17f37f920>
Error processing https://res.cloudinary.com/encore-prod/video/upload/ar_1:1,c_fill,so_2/v1724719859/Feeds/dqpnggwybx0fjcaddsh9.png: cannot identify image file <_io.BytesIO object at 0x17ec93470>
Error processing https://res.cloudinary.com/encore-prod/video/upload/ar_1:1,c_fill,so_2/v1724719859/Feeds/dqpnggwybx0fjcaddsh9.png: cannot identify image file <_io.BytesIO object at 0x17ebfdee0>
Error processing https://

In [21]:
# Drop rows whose feature vector is entirely zero (the extractor's fallback
# value). The same boolean mask is applied to the metadata and to the features
# so both stay row-for-row consistent.
has_nonzero_feature = image_features.apply(lambda vec: bool(np.any(vec != 0)))
df = df[has_nonzero_feature]
image_features = image_features[has_nonzero_feature]

In [25]:
# Sanity check, printed one value per line:
#   1. length of the first feature vector
#   2. number of valid feature vectors
#   3. number of valid metadata rows
first_vector = image_features.values[0]
print(len(first_vector), len(image_features), len(df), sep="\n")

2048
53098
53098


In [23]:
# Convert the Series of arrays into a DataFrame with one column per feature
# dimension. The frame is built on image_features' own index: df and
# image_features were filtered with the same mask, so their indexes are
# identical but no longer contiguous. A default RangeIndex here would make
# pd.concat(axis=1) outer-join on mismatched labels, pairing features with the
# wrong rows and adding NaN-filled rows.
features_df = pd.DataFrame(image_features.tolist(), index=image_features.index)

# Use string column names ("_c0", "_c1", ...): Parquet column names are
# strings, and these are the names the Spark reader looks up.
features_df.columns = [f"_c{i}" for i in features_df.columns]

# Merge the features with the original DataFrame (aligned on the shared index)
assert df.index.equals(features_df.index), "metadata and features are misaligned"
df_with_features = pd.concat([df, features_df], axis=1)
assert len(df_with_features) == len(df), "concat must not add or drop rows"

# Save the result as a Parquet file to preserve dimensionality
df_with_features.to_parquet("image_features_with_metadata.parquet", index=False)

In [26]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA, VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder.appName("PCAExample").getOrCreate()

# Read the parquet file into a Spark DataFrame
df_with_fe = spark.read.parquet("image_features_with_metadata.parquet")

# Names of the per-dimension feature columns expected in the parquet file
feature_columns = [f"_c{i}" for i in range(2048)]

# Fail early with a readable message rather than a long Spark analysis error
# if the file was written with different column names.
available_columns = set(df_with_fe.columns)
missing_columns = [c for c in feature_columns if c not in available_columns]
if missing_columns:
    raise ValueError(
        f"{len(missing_columns)} expected feature columns are missing from the "
        f"parquet file (first missing: {missing_columns[0]!r})"
    )

# Combine the feature columns into a single vector column.
# Vectors.dense() builds a local vector from plain numbers and cannot take
# Column expressions; VectorAssembler is the DataFrame-level way to do this.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df_with_fe = assembler.transform(df_with_fe)

# Perform PCA: project the assembled vectors onto the top 50 components
pca = PCA(k=50, inputCol="features", outputCol="pca_features")
model = pca.fit(df_with_fe)
result = model.transform(df_with_fe)

# Show the result
result.select("pca_features").show(truncate=False)

(53658, 2054)