In [1]:
from pathlib import Path

from video2pdf.utils.evaluation import get_pdf_files, build_pdf_files_df, clean_df, evaluate

In [2]:
# ---- Input folders used by the evaluation below.
# NOTE(review): both are absolute, machine-specific paths; adjust them when
# running this notebook on another machine.

# Folder scanned for the reference pdfs
reference_pdfs_folder_path = Path("/home/vedant/Desktop/glimpsify/most_info_frame_extractor/video2pdf/archives/data_archive_52_new_reference_pdfs_pdfs")

# Folder scanned for the generated pdfs
generated_pdfs_folder_path = Path("/home/vedant/Desktop/glimpsify/most_info_frame_extractor/video2pdf/archives/data_archive_50_output_all_dirs_phash_approval_strategy_pdfs")

In [4]:
def main(version_number=2,
         extraction_strategy_name="prominent_peaks__phash",
         output_dir="./28",
         reference_folder=None,
         generated_folder=None):
    """Core function to create evaluation report.

    Collects the pdf files from the reference and generated folders, pairs
    them up by `internal_id`, runs `evaluate` on the paired table and saves
    the resulting report as a csv file.

    Args:
        version_number: Version tag embedded in the report file name.
        extraction_strategy_name: Strategy name embedded in the report file name.
        output_dir: Folder the csv report is written to (created if missing).
        reference_folder: Folder with the reference pdfs. Defaults to the
            notebook-level `reference_pdfs_folder_path`.
        generated_folder: Folder with the generated pdfs. Defaults to the
            notebook-level `generated_pdfs_folder_path`.

    Returns:
        The dataframe returned by `evaluate`; the same data is written to
        `<output_dir>/evaluation_report_<strategy>_v<version>.csv`.
    """
    # ---- Fall back to the notebook-level folders when none are passed in
    if reference_folder is None:
        reference_folder = reference_pdfs_folder_path
    if generated_folder is None:
        generated_folder = generated_pdfs_folder_path

    output_path = Path(output_dir) / f"evaluation_report_{extraction_strategy_name}_v{version_number}.csv"

    # ---- Get pdf files from folder
    reference_pdf_files = get_pdf_files(reference_folder)
    generated_pdf_files = get_pdf_files(generated_folder)

    # ---- Build df for the pdf files
    # ---- Columns would be `internal_id`, `pdf_path`
    reference_pdf_files_df = build_pdf_files_df(reference_pdf_files)
    generated_pdf_files_df = build_pdf_files_df(generated_pdf_files)

    # ---- Clean df; drop duplicate pdfs for same internal_id
    reference_pdf_files_df = clean_df(reference_pdf_files_df)
    generated_pdf_files_df = clean_df(generated_pdf_files_df)

    # ---- Keep only those generated pdfs for which we have reference pdfs
    has_reference = generated_pdf_files_df["internal_id"].isin(reference_pdf_files_df["internal_id"])

    # ---- Set index for performing join operation
    # ---- `set_index` returns new frames here; the filtered frame is a slice of
    # ---- another frame, so it is not modified in place
    generated_by_id_df = generated_pdf_files_df[has_reference].set_index("internal_id")
    reference_by_id_df = reference_pdf_files_df.set_index("internal_id")

    # ---- Assemble data as a df; overlapping column names get the suffixes below
    evaluation_data_df = generated_by_id_df.join(reference_by_id_df, on="internal_id",
                                                 lsuffix="_generated",
                                                 rsuffix="_reference")
    # ---- Reset index so that internal_id key is available as column
    evaluation_data_df = evaluation_data_df.reset_index()
    evaluation_report_df = evaluate(evaluation_data_df)

    # ---- Save evaluation report
    output_path.parent.mkdir(parents=True, exist_ok=True)
    evaluation_report_df.to_csv(output_path, index=False)
    return evaluation_report_df

In [5]:
# Build the evaluation report (main() also saves it as a csv file)
evaluation_report_df = main()
# Preview the first five rows of the report
evaluation_report_df.head(n=5)

Processing evaluation for: wnhdmj
Processing evaluation for: dxeqwb
Processing evaluation for: aqqzpy
Processing evaluation for: cruilc
Processing evaluation for: yaldqc
Processing evaluation for: hrpnxe
Processing evaluation for: sdipnw
Processing evaluation for: ruquuc
Processing evaluation for: ozygri
Processing evaluation for: qzxibg
Processing evaluation for: vhghaj
Processing evaluation for: yhiddy
Processing evaluation for: mhhqzg
Processing evaluation for: sosjvp
Processing evaluation for: rgqydw
Processing evaluation for: czeswe
Processing evaluation for: bqzprh
Processing evaluation for: pdddxw
Processing evaluation for: tvtjdv
Processing evaluation for: yirczy
Processing evaluation for: hhboxg
Processing evaluation for: iqntxc
Processing evaluation for: lxwqhv
Processing evaluation for: znavfo
Processing evaluation for: tpfibt
Processing evaluation for: wdvnza
Processing evaluation for: fcxvmr
Processing evaluation for: xiietr
Processing evaluation for: xpetji
Processing eva

  evaluation_report_df = pd.concat([evaluation_report_df, pd.DataFrame(new_rows_list)], ignore_index=True)


Unnamed: 0,internal_id,num_of_duplicates,num_of_missing_key_frames,num_of_non_key_frames,generated_pdf_key_frame_count,reference_pdf_key_frame_count,accuracy,precision,similarity_score
0,wnhdmj,0,14,1,9,22,0.363636,0.888889,0.347826
1,dxeqwb,0,11,4,17,24,0.541667,0.764706,0.464286
2,aqqzpy,0,6,1,4,9,0.333333,0.75,0.3
3,cruilc,2,6,2,10,12,0.5,0.6,0.428571
4,yaldqc,2,11,0,14,23,0.521739,0.857143,0.521739


In [5]:
# Preview the first five rows of the evaluation report
evaluation_report_df.head(n=5)

Unnamed: 0,internal_id,num_of_duplicates,num_of_missing_key_frames,num_of_non_key_frames,generated_pdf_key_frame_count,reference_pdf_key_frame_count,accuracy,precision,similarity_score
0,wnhdmj,0,14,1,9,22,0.363636,0.888889,0.347826
1,dxeqwb,0,11,4,17,24,0.541667,0.764706,0.464286
2,aqqzpy,0,6,1,4,9,0.333333,0.75,0.3
3,cruilc,2,6,2,10,12,0.5,0.6,0.428571
4,yaldqc,2,11,0,14,23,0.521739,0.857143,0.521739
