In [92]:
from pathlib import Path
from typing import Optional

import pandas as pd

prominent_peak_report_path = "/home/vedant/Desktop/glimpsify/most_info_frame_extractor/experiments/28/evaluation_report.csv"
k_transactions_report_path = "/home/vedant/Desktop/glimpsify/most_info_frame_extractor/experiments/28/evaluation_report_k_trans_v3.csv"
comparison_report_path = "/home/vedant/Desktop/glimpsify/most_info_frame_extractor/experiments/29/comparison_report_peak_vs_k_trans.csv"

Path(comparison_report_path).parent.mkdir(parents=True, exist_ok=True)


def remove_outliers(df, col, suffix: Optional[str] = None):
    if not suffix:
        suffix = ""

    Q1 = df[col + suffix].quantile(0.25)
    Q3 = df[col + suffix].quantile(0.75)
    IQR = Q3 - Q1

    multiplier = 1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    df = df[(df[col + suffix] >= lower_bound) & (df[col + suffix] <= upper_bound)]
    return df


def compare_extraction_strategies(path1: str, path2: str, lsuffix="_first", rsuffix="_second", should_remove_outliers=False):
    df1 = pd.read_csv(path1)
    df2 = pd.read_csv(path2)

    if should_remove_outliers:
        df1 = remove_outliers(df1, "similarity_score")
        df2 = remove_outliers(df2, "similarity_score")

    df1.set_index("internal_id", inplace=True)
    df2.set_index("internal_id", inplace=True)

    comparison_df = df1.join(df2, lsuffix=lsuffix, rsuffix=rsuffix, how="inner")
    comparison_df["similarity_score_result"] = comparison_df.apply(
        lambda x: x["similarity_score" + lsuffix] > x["similarity_score" + rsuffix], axis=1)

    new_col_arrangement = []
    cols = df1.columns
    for col in cols:
        new_col_arrangement.extend([col + lsuffix, col + rsuffix])
    comparison_df = comparison_df[new_col_arrangement]

    return comparison_df


In [93]:
comparison_df = compare_extraction_strategies(prominent_peak_report_path, k_transactions_report_path,
                                              lsuffix="_prominent_peaks", rsuffix="_k_transactions", should_remove_outliers=False)
comparison_df.head()

Unnamed: 0_level_0,generated_pdf_path_prominent_peaks,generated_pdf_path_k_transactions,reference_pdf_path_prominent_peaks,reference_pdf_path_k_transactions,num_of_duplicates_prominent_peaks,num_of_duplicates_k_transactions,num_of_missing_key_frames_prominent_peaks,num_of_missing_key_frames_k_transactions,num_of_non_key_frames_prominent_peaks,num_of_non_key_frames_k_transactions,generated_pdf_key_frame_count_prominent_peaks,generated_pdf_key_frame_count_k_transactions,reference_pdf_key_frame_count_prominent_peaks,reference_pdf_key_frame_count_k_transactions,accuracy_prominent_peaks,accuracy_k_transactions,similarity_score_prominent_peaks,similarity_score_k_transactions
internal_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
lhopxd,/home/vedant/Downloads/key-frame-extraction-re...,/home/vedant/Desktop/glimpsify/most_info_frame...,/home/vedant/Downloads/reference-pdfs-v1/lhopx...,/home/vedant/Downloads/reference-pdfs-v1/lhopx...,0,0,3,3,1,1,9,9,11,11,0.727273,0.727273,0.666667,0.666667
wayvbl,/home/vedant/Downloads/key-frame-extraction-re...,/home/vedant/Desktop/glimpsify/most_info_frame...,/home/vedant/Downloads/reference-pdfs-v1/wayvb...,/home/vedant/Downloads/reference-pdfs-v1/wayvb...,54,0,20,39,61,20,145,31,50,50,0.6,0.22,0.27027,0.157143
ruweka,/home/vedant/Downloads/key-frame-extraction-re...,/home/vedant/Desktop/glimpsify/most_info_frame...,/home/vedant/Downloads/reference-pdfs-v1/ruwek...,/home/vedant/Downloads/reference-pdfs-v1/ruwek...,0,0,4,4,6,6,8,8,6,6,0.333333,0.333333,0.166667,0.166667
sgmaal,/home/vedant/Downloads/key-frame-extraction-re...,/home/vedant/Desktop/glimpsify/most_info_frame...,/home/vedant/Downloads/reference-pdfs-v1/sgmaa...,/home/vedant/Downloads/reference-pdfs-v1/sgmaa...,0,0,6,8,2,2,13,11,17,17,0.647059,0.529412,0.578947,0.473684
bmqioa,/home/vedant/Downloads/key-frame-extraction-re...,/home/vedant/Desktop/glimpsify/most_info_frame...,/home/vedant/Downloads/reference-pdfs-v1/bmqio...,/home/vedant/Downloads/reference-pdfs-v1/bmqio...,0,0,3,3,0,0,7,7,10,10,0.7,0.7,0.7,0.7


In [94]:
comparison_df.to_csv(comparison_report_path)

In [95]:
comparison_df.describe()

Unnamed: 0,num_of_duplicates_prominent_peaks,num_of_duplicates_k_transactions,num_of_missing_key_frames_prominent_peaks,num_of_missing_key_frames_k_transactions,num_of_non_key_frames_prominent_peaks,num_of_non_key_frames_k_transactions,generated_pdf_key_frame_count_prominent_peaks,generated_pdf_key_frame_count_k_transactions,reference_pdf_key_frame_count_prominent_peaks,reference_pdf_key_frame_count_k_transactions,accuracy_prominent_peaks,accuracy_k_transactions,similarity_score_prominent_peaks,similarity_score_k_transactions
count,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0
mean,4.25,0.727273,10.818182,13.931818,8.056818,3.965909,23.136364,12.409091,21.647727,21.647727,0.505343,0.424861,0.402814,0.350483
std,10.500411,1.161917,10.802879,13.828004,14.501176,3.448967,27.64021,6.426662,17.928363,17.928363,0.175364,0.176754,0.156656,0.156429
min,0.0,0.0,1.0,0.0,0.0,0.0,2.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,4.0,4.75,1.0,1.0,7.0,8.0,10.0,10.0,0.402056,0.307692,0.291429,0.237782
50%,1.0,0.0,7.5,9.0,3.5,3.0,13.0,10.0,15.0,15.0,0.518445,0.410428,0.403704,0.333333
75%,2.0,1.0,12.25,17.25,10.0,6.0,24.5,14.25,27.25,27.25,0.629464,0.538462,0.5,0.457576
max,54.0,7.0,65.0,79.0,111.0,20.0,145.0,34.0,105.0,105.0,0.897436,1.0,0.833333,0.833333
