In [3]:

from pathlib import Path
from typing import List

import pandas as pd


In [4]:
from video2pdf.utils.evaluation import get_pdf_files, build_pdf_files_df, clean_df

In [5]:
reference_pdfs_folder_path = Path(r"/home/vedant/Downloads/reference-pdfs-v1")
generated_pdfs_folder_path = Path(r"/home/vedant/Downloads/key-frame-extraction-results-v1/result-pdfs")

In [6]:
# ---- Get pdf files from folder
reference_pdf_files = get_pdf_files(reference_pdfs_folder_path)
generated_pdf_files = get_pdf_files(generated_pdfs_folder_path)

# ---- Build df for the pdf files
# ---- Columns would be `internal_id`, `pdf_path`
reference_pdf_files_df = build_pdf_files_df(reference_pdf_files)
generated_pdf_files_df = build_pdf_files_df(generated_pdf_files)

# ---- Clean df; drop duplicate pdfs for same internal_id
reference_pdf_files_df = clean_df(reference_pdf_files_df)
generated_pdf_files_df = clean_df(generated_pdf_files_df)

# ---- Keep only those generated pdfs for which we have reference pdfs
generated_pdf_files_subset_df = generated_pdf_files_df[
    generated_pdf_files_df["internal_id"].isin(reference_pdf_files_df["internal_id"])]

In [8]:
folder = "zgpqgn"
reference_pdf_files_df[reference_pdf_files_df["internal_id"] == folder].empty


False

In [145]:
generated_pdf_files_subset_df.set_index("internal_id", inplace=True)
reference_pdf_files_df.set_index("internal_id", inplace=True)

In [146]:
generated_pdf_files_subset_df.head()

Unnamed: 0_level_0,pdf_path
internal_id,Unnamed: 1_level_1
lhopxd,/home/vedant/Downloads/key-frame-extraction-re...
wayvbl,/home/vedant/Downloads/key-frame-extraction-re...
ruweka,/home/vedant/Downloads/key-frame-extraction-re...
sgmaal,/home/vedant/Downloads/key-frame-extraction-re...
bmqioa,/home/vedant/Downloads/key-frame-extraction-re...


In [147]:
reference_pdf_files_df.head()

Unnamed: 0_level_0,pdf_path
internal_id,Unnamed: 1_level_1
zgpqgn,/home/vedant/Downloads/reference-pdfs-v1/zgpqg...
olaqxj,/home/vedant/Downloads/reference-pdfs-v1/olaqx...
aqeiwr,/home/vedant/Downloads/reference-pdfs-v1/aqeiw...
xpetji,/home/vedant/Downloads/reference-pdfs-v1/xpetj...
xtlxph,/home/vedant/Downloads/reference-pdfs-v1/xtlxp...


In [148]:
evaluation_data_df = generated_pdf_files_subset_df.join(reference_pdf_files_df, on="internal_id", lsuffix="_generated",
                                                        rsuffix="_reference")
evaluation_data_df.reset_index(inplace=True)
evaluation_data_df.head()

Unnamed: 0,internal_id,pdf_path_generated,pdf_path_reference
0,lhopxd,/home/vedant/Downloads/key-frame-extraction-re...,/home/vedant/Downloads/reference-pdfs-v1/lhopx...
1,wayvbl,/home/vedant/Downloads/key-frame-extraction-re...,/home/vedant/Downloads/reference-pdfs-v1/wayvb...
2,ruweka,/home/vedant/Downloads/key-frame-extraction-re...,/home/vedant/Downloads/reference-pdfs-v1/ruwek...
3,sgmaal,/home/vedant/Downloads/key-frame-extraction-re...,/home/vedant/Downloads/reference-pdfs-v1/sgmaa...
4,bmqioa,/home/vedant/Downloads/key-frame-extraction-re...,/home/vedant/Downloads/reference-pdfs-v1/bmqio...


In [149]:
evaluation_data_df.columns

Index(['internal_id', 'pdf_path_generated', 'pdf_path_reference'], dtype='object')

In [150]:
# ----- Init evaluation report df
evaluation_report_cols = ['internal_id',
                          'num_of_duplicates',
                          'num_of_missing_key_frames',
                          'num_of_non_key_frames', 'generated_pdf_key_frame_count', 'reference_pdf_key_frame_count',
                          'accuracy', 'similarity_score']
evaluation_report_df = pd.DataFrame(columns=[evaluation_report_cols])

# ---- Build evaluation report
for internal_id, generated_pdf_path, reference_pdf_path in evaluation_data_df.values:
    break

In [160]:
import pandas as pd
import fitz  # PyMuPDF
from PIL import Image
import imagehash
import io
# from collections import Counter # Potentially useful, but set operations are more direct

# ----- Init evaluation report df
# Define the columns for the evaluation report DataFrame
evaluation_report_cols = ['internal_id',
                          'generated_pdf_path',
                          'reference_pdf_path',
                          'num_of_duplicates',
                          'num_of_missing_key_frames',
                          'num_of_non_key_frames',
                          'generated_pdf_key_frame_count',
                          'reference_pdf_key_frame_count',
                          'accuracy',
                          'similarity_score']
# Initialize an empty DataFrame to store evaluation results
evaluation_report_df = pd.DataFrame(columns=evaluation_report_cols)

def extract_image_hashes_from_pdf(pdf_path: str) -> list:
    """
    Extracts images from each page of a PDF and computes their perceptual hashes (pHash).
    pHash is used because it identifies images that are visually similar,
    which is suitable for lecture slides where content might have minor variations
    but should be considered the same if visually identical.
    Assumes one image per page.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list: A list of imagehash.ImageHash objects.
    """
    hashes = []
    try:
        pdf_document = fitz.open(pdf_path)
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            # Render page to a pixmap (an image representation)
            pix = page.get_pixmap()
            # Convert pixmap to PNG image bytes
            img_bytes = pix.tobytes("png")
            # Open image bytes with Pillow
            pil_image = Image.open(io.BytesIO(img_bytes))

            # Convert to RGB if it's RGBA, P (palette), or LA (Luminance Alpha)
            # to ensure consistency for hashing, as pHash works best on RGB.
            if pil_image.mode in ('RGBA', 'P', 'LA'):
                pil_image = pil_image.convert('RGB')

            # Compute perceptual hash (pHash)
            img_hash = imagehash.phash(pil_image)
            hashes.append(img_hash)
        pdf_document.close()
    except Exception as e:
        print(f"Error processing PDF {pdf_path}: {e}")
        # Return empty list if a PDF is corrupt, unreadable, or contains no valid images
    return hashes

# ---- Build evaluation report

# Placeholder for evaluation_data_df.
# This DataFrame should be populated with your actual data,
# containing 'internal_id', 'generated_pdf_path', and 'reference_pdf_path' for each video.
# Example:
# placeholder_data = {
#     'internal_id': ['video_001_lecture_A', 'video_002_lecture_B'],
#     'generated_pdf_path': ['/path/to/your/generated_video_001.pdf', '/path/to/your/generated_video_002.pdf'],
#     'reference_pdf_path': ['/path/to/your/reference_video_001.pdf', '/path/to/your/reference_video_002.pdf']
# }
# evaluation_data_df = pd.DataFrame(placeholder_data)

# If evaluation_data_df is not defined, create a dummy one for the script to run without error.
if 'evaluation_data_df' not in globals():
    print("Warning: evaluation_data_df is not defined. Using an empty placeholder.")
    evaluation_data_df = pd.DataFrame(columns=['internal_id', 'generated_pdf_path', 'reference_pdf_path'])


# List to store data for new rows, to be concatenated later for efficiency
new_rows_list = []

for index, row_data in evaluation_data_df.iterrows():
    internal_id = row_data['internal_id']
    generated_pdf_path = row_data['pdf_path_generated']
    reference_pdf_path = row_data['pdf_path_reference']

    print(f"Processing evaluation for: {internal_id}")

    # Extract perceptual hashes from generated and reference PDFs
    # These hashes represent the visual content of each slide.
    gen_hashes_list = extract_image_hashes_from_pdf(generated_pdf_path)
    ref_hashes_list = extract_image_hashes_from_pdf(reference_pdf_path)

    # Total number of frames (images/pages) in the generated PDF
    generated_pdf_key_frame_count = len(gen_hashes_list)

    # Create sets of hashes for efficient comparison.
    # For reference, assume it's already curated (no duplicates), but set ensures uniqueness.
    ref_hashes_set = set(ref_hashes_list)
    reference_pdf_key_frame_count = len(ref_hashes_set) # Number of unique reference frames

    # Unique hashes from the generated PDF
    unique_gen_hashes_set = set(gen_hashes_list)
    num_unique_generated_frames = len(unique_gen_hashes_set)

    # METRIC 1: Number of duplicates in the generated PDF
    # Duplicates are extra occurrences of the same visual slide.
    num_of_duplicates = generated_pdf_key_frame_count - num_unique_generated_frames

    # Identify True Positives: Unique generated frames that are also in reference frames.
    # These are the correctly identified keyframes.
    true_positives_hashes_set = unique_gen_hashes_set.intersection(ref_hashes_set)
    num_true_positives = len(true_positives_hashes_set)

    # METRIC 2: Number of missing keyframes
    # These are frames present in the reference set but not found in the unique generated set (False Negatives).
    missed_hashes_set = ref_hashes_set.difference(unique_gen_hashes_set)
    num_of_missing_key_frames = len(missed_hashes_set)

    # METRIC 3: Number of non-key frames (False Positives)
    # These are frames present in the unique generated set but not found in the reference set.
    # These are incorrectly identified as keyframes.
    non_key_frame_hashes_set = unique_gen_hashes_set.difference(ref_hashes_set)
    num_of_non_key_frames = len(non_key_frame_hashes_set)

    # METRIC 4: Accuracy
    # Defined as: (number of correctly identified unique keyframes) / (total number of actual keyframes)
    # The numerator (generated_pdf_key_frame_count - num_of_duplicates - num_of_non_key_frames)
    # simplifies to num_true_positives.
    if reference_pdf_key_frame_count > 0:
        accuracy = num_true_positives / reference_pdf_key_frame_count
    else:
        # If there are no reference keyframes:
        # Accuracy is 1.0 if no frames were generated (correctly identified nothing).
        # Accuracy is 0.0 if any frames were generated (all would be non-keyframes).
        accuracy = 1.0 if num_unique_generated_frames == 0 else 0.0

    # METRIC 5: Similarity Score (Jaccard Index)
    # J(A,B) = |A ∩ B| / |A ∪ B|
    # A = unique_gen_hashes_set, B = ref_hashes_set
    # Intersection = num_true_positives
    # Union = total unique items in either set
    union_hashes_set = unique_gen_hashes_set.union(ref_hashes_set)
    num_union_hashes = len(union_hashes_set)

    if num_union_hashes > 0:
        similarity_score = num_true_positives / num_union_hashes
    else:
        # If both sets are empty (no generated frames and no reference frames),
        # their Jaccard index is 1.0 (perfectly similar in their emptiness).
        similarity_score = 1.0


    # Store the calculated results for the current item
    current_eval_results = {
        'internal_id': internal_id,
        'generated_pdf_path': generated_pdf_path,
        'reference_pdf_path': reference_pdf_path,
        'num_of_duplicates': num_of_duplicates,
        'num_of_missing_key_frames': num_of_missing_key_frames,
        'num_of_non_key_frames': num_of_non_key_frames,
        'generated_pdf_key_frame_count': generated_pdf_key_frame_count,
        'reference_pdf_key_frame_count': reference_pdf_key_frame_count,
        'accuracy': accuracy,
        'similarity_score': similarity_score
    }
    new_rows_list.append(current_eval_results)

# Concatenate all new rows to the main DataFrame at once for better performance
if new_rows_list:
    evaluation_report_df = pd.concat([evaluation_report_df, pd.DataFrame(new_rows_list)], ignore_index=True)

Processing evaluation for: lhopxd
Processing evaluation for: wayvbl
Processing evaluation for: ruweka
Processing evaluation for: sgmaal
Processing evaluation for: bmqioa
Processing evaluation for: xpetji
Processing evaluation for: qzxibg
Processing evaluation for: czeswe
Processing evaluation for: vhghaj
Processing evaluation for: meatkb
Processing evaluation for: srmqde
Processing evaluation for: hhboxg
Processing evaluation for: mfewuh
Processing evaluation for: wdvnza
Processing evaluation for: oyxwwk
Processing evaluation for: apivsf
Processing evaluation for: hydixc
Processing evaluation for: pssllz
Processing evaluation for: tpfibt
Processing evaluation for: fxddqo
Processing evaluation for: cyqohz
Processing evaluation for: yaldqc
Processing evaluation for: tvtjdv
Processing evaluation for: yysaqx
Processing evaluation for: obwxrn
Processing evaluation for: oogxef
Processing evaluation for: txxahm
Processing evaluation for: yecmvl
Processing evaluation for: aqqzpy
Processing eva

  evaluation_report_df = pd.concat([evaluation_report_df, pd.DataFrame(new_rows_list)], ignore_index=True)


In [159]:
evaluation_report_df.head()

Unnamed: 0,internal_id,generated_pdf_path,reference_pdf_path,num_of_duplicates,num_of_missing_key_frames,num_of_non_key_frames,generated_pdf_key_frame_count,reference_pdf_key_frame_count,accuracy,similarity_score
0,lhopxd,/home/vedant/Downloads/key-frame-extraction-re...,/home/vedant/Downloads/reference-pdfs-v1/lhopx...,0,3,1,9,11,0.727273,0.666667


In [162]:
output_path = Path(r"./28/evaluation_report_v2_k_trans.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

evaluation_report_df.to_csv(output_path, index=False)

In [169]:
# Replace 'your_id_here' with the specific internal_id you're looking for
filtered_row = evaluation_report_df[evaluation_report_df['internal_id'] == 'gxtjwd']

# Display the result
filtered_row

Unnamed: 0,internal_id,generated_pdf_path,reference_pdf_path,num_of_duplicates,num_of_missing_key_frames,num_of_non_key_frames,generated_pdf_key_frame_count,reference_pdf_key_frame_count,accuracy,similarity_score
86,gxtjwd,/home/vedant/Downloads/key-frame-extraction-re...,/home/vedant/Downloads/reference-pdfs-v1/gxtjw...,0,8,6,8,10,0.2,0.125


NameError: name 'reference_pdf_files_df' is not defined