# Perceptual hashing

In [6]:
import cv2
import imagehash
from PIL import Image

def generate_video_fingerprint(video_path, interval_seconds=2):
    """
    Samples a video every X seconds and returns a list of 64-bit binary pHashes.
    """
    cap = cv2.VideoCapture(video_path)
    
    if not cap.isOpened():
        print(f"Error: Could not open video {video_path}")
        return []

    # Get video properties
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    duration = total_frames / fps
    
    hashes = []
    
    # Calculate frame step (e.g., at 30fps, every 2s is 60 frames)
    frame_step = int(fps * interval_seconds)

    current_frame = 0
    while current_frame < total_frames:
        # Set the video position to the specific frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, current_frame)
        success, frame = cap.read()
        
        if not success:
            break

        # 1. Convert OpenCv BGR to PIL RGB
        color_coverted = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pil_img = Image.fromarray(color_coverted)

        # 2. Compute pHash
        # phash returns a 64-bit hash (8x8 grid)
        hash_obj = imagehash.phash(pil_img)
        
        # 3. Convert Hex to 64-bit Binary String for Supabase BIT(64)
        # We use zfill(64) to ensure leading zeros aren't lost
        hash_as_int = int(str(hash_obj), 16)
        binary_hash = bin(hash_as_int)[2:].zfill(64)

        hashes.append({
            "timestamp_seconds": round(current_frame / fps, 2),
            "hash": binary_hash
        })

        # Advance to the next 2-second mark
        current_frame += frame_step
    cap.release()
    return hashes

def are_identical(first_path, second_path):
    hashes1 = generate_video_fingerprint(first_path, 2)
    hashes2 = generate_video_fingerprint(second_path,4)
    # INSERT_YOUR_CODE
    def hamming_distance(hash1, hash2):
        """Compute the Hamming distance between two 64-bit binary strings."""
        # Assumes input are 64-char strings of '0' and '1'
        return sum(c1 != c2 for c1, c2 in zip(hash1, hash2))

    # For each timestamped hash in hashes1, find the minimum Hamming distance to hashes2
    # If >70% of hashes have corresponding matches within a small threshold (e.g., <=6 bits), declare identical.
    threshold = 6
    matches = 0
    for h1 in hashes1:
        h1_hash = h1["hash"]
        # Find min hamming dist for this h1 to any in hashes2
        min_dist = min(hamming_distance(h1_hash, h2["hash"]) for h2 in hashes2)
        if min_dist <= threshold:
            matches += 1

    match_fraction = matches / max(len(hashes1), 1)
    # You can adjust fraction (e.g., 0.7 == "70% of hashes match closely")
    return matches 


In [None]:
# --- Example Usage ---
video_file = "C:/Users/shels/Documents/wezareit el dakhleya/dakhleyaVideos/blue car swerving/blue car swerving.mp4"
fingerprints = generate_video_fingerprint(video_file)
for entry in fingerprints:
   print(f"Time: {entry['timestamp_seconds']}s | Hash: {entry['hash']}")


Time: 0.0s | Hash: 1100011101111000001110001100011101011011001110000000011111000011
Time: 2.0s | Hash: 1001111001010000001000011011110101011110100000000011000111111111
Time: 4.0s | Hash: 1001111001101000011000111001100110001100011001101101001010011101
Time: 6.0s | Hash: 1000100101110110001101001100100100100110001101001101110111001011
Time: 8.0s | Hash: 1011011101101100010000001010101110101101110101100101001000110001
Time: 10.0s | Hash: 1101001100111110011000011000010110011110011000110011111011000000
Time: 12.0s | Hash: 1100001100110100001111001101100011100110001001010001100111111010
Time: 14.0s | Hash: 1101110000100011000000111111110001001100000000010111111111101100
Time: 16.0s | Hash: 1101110100101001000000001110011011111111100111010000000001100111
Time: 18.0s | Hash: 1100010000111111001110111101000001000100110011110011101101100000
Time: 20.0s | Hash: 1100001100110100001111011100101111000010001101100001100111001011


In [7]:
print(are_identical(video_file,video_file))

6
