1. Preparations

In [9]:
import evaluation as eval
import numpy as np
import pandas as pd
import os
import json
from enum import Enum

In [10]:
def get_pdf_summary_path() -> str:
    """Return the absolute path of the PDF-summary benchmark directory.

    The location is resolved relative to the current working directory:
    two levels up, then ``resources/benchmark/pdf_summary``.

    Returns:
        Absolute, normalised path of the ``pdf_summary`` directory.
    """
    # Join the components with the platform's own separator. Hard-coded
    # backslashes only form a path on Windows; elsewhere they end up as
    # literal characters inside a single file name.
    relative_path = os.path.join(
        os.pardir, os.pardir, 'resources', 'benchmark', 'pdf_summary'
    )
    return os.path.abspath(relative_path)

def get_pdf_summary_subpath(subpath:str) -> str:
    """Build the path of an entry located inside the PDF-summary directory.

    Args:
        subpath: Path fragment relative to the PDF-summary directory.

    Returns:
        ``subpath`` joined onto the value of ``get_pdf_summary_path()``.
    """
    base_dir = get_pdf_summary_path()
    return os.path.join(base_dir, subpath)

2. Load Files into dictionaries

In [17]:
class SumAlgo(Enum):
    """Sources of summaries handled by this notebook.

    Each member's value is used as the name of a sub-directory of the
    PDF-summary directory.
    """
    # Presumably summaries produced by Latent Semantic Analysis — TODO confirm.
    LSA = 'lsa'
    # Presumably summaries produced by TextRank — TODO confirm.
    TEXTRANK = 'textrank'
    # Directory whose files define the list of file names to process.
    REF = 'ref'

def get_file_name_list() -> list[str]:
    """Collect the names of all files found under the reference directory.

    The directory belonging to ``SumAlgo.REF`` is walked recursively. Only
    bare file names (no directory part) are returned, in the order in which
    ``os.walk`` yields them.

    Returns:
        List of file names.
    """
    ref_dir = get_pdf_summary_subpath(SumAlgo.REF.value)
    return [
        name
        for _, _, names_in_dir in os.walk(ref_dir)
        for name in names_in_dir
    ]

def get_file_path_list(sumAlgo:SumAlgo) -> list[str]:
    """Collect the paths of all files found under one algorithm's directory.

    Args:
        sumAlgo: Member whose value names the sub-directory to walk.

    Returns:
        List of paths, each being the walked directory joined with a file
        name, in the order in which ``os.walk`` yields them.
    """
    algo_dir = get_pdf_summary_subpath(sumAlgo.value)
    paths = []
    for folder, _, names_in_dir in os.walk(algo_dir):
        paths.extend(os.path.join(folder, name) for name in names_in_dir)
    return paths

def get_file_dict(sumAlgo:SumAlgo) -> dict:
    """Load the JSON content of every reference file for one algorithm.

    Args:
        sumAlgo: Member whose directory is read.

    Returns:
        Dict mapping each file name from ``get_file_name_list()`` to the
        parsed JSON content of the same-named file in the directory of
        ``sumAlgo``.

    Raises:
        FileNotFoundError: If the directory of ``sumAlgo`` contains no file
            with the name of one of the reference files.
    """
    # Match files by name instead of by list position: the two lists come
    # from separate os.walk runs over different directories, so neither
    # their order nor their length is guaranteed to line up.
    # NOTE(review): assumes every algorithm directory uses the same file
    # names as the reference directory — confirm against the data layout.
    path_by_name = {
        os.path.basename(path): path
        for path in get_file_path_list(sumAlgo)
    }

    result = {}
    for file_name in get_file_name_list():
        if file_name not in path_by_name:
            raise FileNotFoundError(
                f"No file named '{file_name}' found for '{sumAlgo.value}'"
            )
        result[file_name] = read_file(path_by_name[file_name])
    return result

def read_file(path:str) -> any:
    """Parse the JSON document stored at ``path``.

    Args:
        path: Location of the JSON file.

    Returns:
        The deserialised JSON value.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # NOTE(review): the return annotation is the builtin ``any`` function;
    # ``typing.Any`` was presumably intended. Left unchanged here to keep the
    # signature as it is.
    #
    # Decode explicitly instead of relying on the locale's default codec
    # (e.g. cp1252 on Windows). 'utf-8-sig' reads plain UTF-8 and also skips
    # a leading byte-order mark, which json would otherwise reject.
    with open(path, 'r', encoding='utf-8-sig') as f:
        return json.load(f)

def get_segment_dict(sumAlgo:SumAlgo) -> dict:
    """Map every file name to the ``'segments'`` entry of its JSON content.

    Args:
        sumAlgo: Member whose files are loaded through ``get_file_dict``.

    Returns:
        Dict keyed by file name; each value is the ``'segments'`` value of
        that file's parsed JSON.

    Raises:
        KeyError: If a loaded file has no ``'segments'`` key.
    """
    file_dict = get_file_dict(sumAlgo)
    # Iterate the loaded dict itself so the function is self-contained and
    # does not depend on a notebook-level variable having been created by
    # another cell or statement beforehand.
    return {
        file_name: content['segments']
        for file_name, content in file_dict.items()
    }

# Create the file-name list before anything else so that no statement below
# relies on a name that does not exist yet when the kernel is fresh.
file_name_list = get_file_name_list()

# One dictionary (file name -> segments) per summary source.
ref_segment_dict = get_segment_dict(SumAlgo.REF)
textrank_segment_dict = get_segment_dict(SumAlgo.TEXTRANK)
lsa_segment_dict = get_segment_dict(SumAlgo.LSA)

3. Evaluate

In [18]:
# Mean rouge_1 of all paragraphs
def get_header_dict(segments:list) -> dict:
    """Index segment contents by their header.

    Args:
        segments: Iterable of mappings, each providing a ``'header'`` and a
            ``'content'`` entry.

    Returns:
        Dict mapping every header to its content. If a header occurs more
        than once, the content of its last occurrence is kept.
    """
    return {entry['header']: entry['content'] for entry in segments}

def comparison(test_segment_dict:dict, ref_segment_dict:dict, eval_method:eval.EvalMethod) -> pd.DataFrame:
    """Check, file by file, that test and reference have equally many headers.

    NOTE(review): this function is unfinished. ``eval_method`` is not used
    yet and nothing is returned, although the signature announces a
    ``pd.DataFrame``.

    Args:
        test_segment_dict: Dict mapping file name to the segment list of the
            summary under test.
        ref_segment_dict: Dict mapping file name to the segment list of the
            reference summary.
        eval_method: Currently unused.

    Raises:
        KeyError: If a file name from ``get_file_name_list()`` is missing in
            either dictionary.
    """
    for file_name in get_file_name_list():
        ref_header_dict = get_header_dict(ref_segment_dict[file_name])
        test_header_dict = get_header_dict(test_segment_dict[file_name])

        # Only the number of distinct headers is compared, not their names.
        if len(ref_header_dict) != len(test_header_dict):
            # dict_keys cannot be concatenated to a str (TypeError), so the
            # keys are rendered as a list first.
            print("The number of keys do not match: " + str(list(ref_header_dict.keys())))
        
        

        

[{'header': 'INTRODUCTION', 'content': 'The few works that accounted for collisional evolution of the MAB (e.g.; To do that, it is important that we consider not only the MAB total mass, but also its SFD.Only three objects in the current MAB have D > 500 km.'}, {'header': 'MODEL', 'content': "To model the evolution of the MAB SFD and the accretion of planetesimals with D > 500 km in the MAB region we first assume that planetesimals were formed within the first 0.5 Myr after Calcium-Aluminum-Inclusions (CAIs) based on the methods by For the total primordial mass of the MAB we assumed that the initial distribution of planetesimals, uniformly distributed between 1.8 and 3.6 au The exact time and radial distance of Jupiter's formation is unknown (e.g.; Chambers 2021; We followed the accretion of objects within our MAB region for 5 Myr. This interval is presumably the time the gas in the solar nebula dispersed in the outer solar system based on the relative ages of the youngest CB-chondrite