## IE Results Viewer
Viewer for IE results on OASIS journal reports. Facilitates adjustment of section and significance scores to affect results ranking.

In [3]:
%%capture
import warnings
# suppress user and future warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)

# upgrade pip itself; this cell does not install any other package
%pip install --upgrade pip

In [4]:
import os  # for directory listing and path joining
from html import escape  # for writing escaped HTML

import pandas as pd  # for DataFrame
from IPython.display import display, HTML
from ipywidgets import Dropdown, Button, Output, FloatSlider, Layout

# folder scanned for the CSV files offered in the file dropdown, and the
# folder each selected file is read from
DATA_FOLDER = "./data/oasis/journals_july_2024/text_extraction_20251117/ie-output-20260204/"


def get_input_file_names(folder: str) -> list[str]:
    """Return the names of the CSV files in *folder*, sorted ascending.

    Args:
        folder: Path of the directory to scan (sub-directories are not
            descended into).

    Returns:
        Entry names (not full paths) that end in ``.csv``.

    Raises:
        OSError: If *folder* does not exist or cannot be listed.
    """
    # Use the os module directly: `pd.io.common.os` only resolves because
    # pandas happens to import os internally, which is not public API.
    return sorted(name for name in os.listdir(folder) if name.endswith('.csv'))


def get_input_file_data(file_name: str) -> list[dict]:
    """Load one CSV file from DATA_FOLDER as a list of row dictionaries.

    Args:
        file_name: Name of a CSV file located in DATA_FOLDER.

    Returns:
        One dict per CSV row, keyed by column name. Missing (NaN) cells are
        replaced with an empty string.
    """
    # Use os.path directly rather than reaching through pandas' internal
    # `pd.io.common.os` reference, which is not part of its public API.
    file_path = os.path.join(DATA_FOLDER, file_name)
    df = pd.read_csv(file_path, delimiter=",", encoding="utf-8", skip_blank_lines=True)
    # set any NaN values to blank string
    df = df.fillna("")
    return df.to_dict(orient='records')


def get_max_section_score(sections: list[str]) -> float:
    """Return the largest slider weight among the given section names.

    Each name is trimmed and lower-cased before matching. "title",
    "abstract" and "body" take the current value of their slider; any other
    name counts as 0.0. An empty list yields 0.0.
    """
    # Readers are invoked lazily, so a slider is only consulted when its
    # section actually appears in the input.
    weight_readers = {
        "title": lambda: slider_t.value,
        "abstract": lambda: slider_a.value,
        "body": lambda: slider_b.value,
    }

    weights = []
    for raw_name in sections:
        reader = weight_readers.get(raw_name.strip().lower())
        weights.append(0.0 if reader is None else reader())

    if not weights:
        return 0.0
    return max(weights)


def get_results_by_row(file_name: str) -> pd.DataFrame:
    """Score each row of an input file using the current slider values.

    For every kept row the keys 'sec_score', 'sig_proximity', 'score' and
    'score_explain' are overwritten: 'sec_score' is the highest section
    weight for the row's comma-separated 'sections', 'sig_proximity' is the
    significance slider value when the row's original 'sig_proximity' is a
    number greater than zero (otherwise 0.0), and 'score' is their sum.

    Rows whose sections include 'end_matter' are left out of the result,
    in line with get_results_by_concept.

    Args:
        file_name: Name of the CSV file to load.

    Returns:
        DataFrame with one row per kept input row.
    """
    def has_significance(value) -> bool:
        # Blank cells arrive as "" (NaN is replaced with "" on load), which
        # cannot be compared with a number directly.
        try:
            return float(value) > 0.0
        except (TypeError, ValueError):
            return False

    scored_rows = []
    for row in get_input_file_data(file_name):
        # normalise the names so ' end_matter' or 'End_Matter' is also caught,
        # the same way section names are normalised when they are scored
        sections = [name.strip().lower() for name in str(row.get('sections', '')).split(',')]
        if 'end_matter' in sections:
            continue

        sec_score = get_max_section_score(sections)
        sig_proximity = slider_s.value if has_significance(row.get('sig_proximity', 0.0)) else 0.0
        # overriding any existing scores
        row['sec_score'] = sec_score
        row['sig_proximity'] = sig_proximity
        row['score'] = sec_score + sig_proximity
        row['score_explain'] = f"({sec_score} + {sig_proximity})"
        scored_rows.append(row)
    return pd.DataFrame(scored_rows)

            
def get_results_by_concept(file_name: str) -> pd.DataFrame:
    """Aggregate the rows of an input file into one scored entry per concept.

    Rows are grouped by their 'id'; when the id is empty the lower-cased
    'text' is used as the grouping key instead. Rows whose sections include
    'end_matter' are ignored. For each concept the section scores and
    significance scores of its rows are summed, the distinct texts are
    collected and the rows are counted. The label is taken from the first
    row seen for the concept.

    Args:
        file_name: Name of the CSV file to load.

    Returns:
        DataFrame with columns 'id', 'text', 'label', 'sec_score',
        'sig_score', 'score', 'count', 'span' (HTML for display) and
        'score_explain'. The columns are present even when no row qualifies.
    """
    def has_significance(value) -> bool:
        # Blank cells arrive as "" (NaN is replaced with "" on load), which
        # cannot be compared with a number directly.
        try:
            return float(value) > 0.0
        except (TypeError, ValueError):
            return False

    # create clickable links for concept ids that look like URLs, using the concept text as the label
    def make_link(val, lbl):
        url = str(val)  # ids parsed as numbers have no startswith()
        if url.startswith("http"):
            # escape the URL too: it is placed inside a quoted HTML attribute
            return f"<a target='_blank' rel='noopener noreferrer' href='{escape(url)}'>{escape(lbl)}</a>"
        return escape(lbl)

    sum_by_id = {}
    for row in get_input_file_data(file_name):
        # normalise the names so ' end_matter' or 'End_Matter' is also caught
        sections = [name.strip().lower() for name in str(row.get('sections', '')).split(',')]
        if 'end_matter' in sections:
            continue

        concept_id = row.get('id', None)
        if (concept_id or "") == "":
            concept_id = str(row.get('text', 'n/a')).lower()
        if concept_id not in sum_by_id:
            sum_by_id[concept_id] = {
                'id': concept_id,
                'text': [],
                'label': row.get('label', ''),
                'sec_score': 0.0,
                'sig_score': 0.0,
                'score': 0.0,
                'count': 0,
            }
        entry = sum_by_id[concept_id]

        sec_score = get_max_section_score(sections)
        sig_proximity = round(slider_s.value, 2) if has_significance(row.get('sig_proximity', 0)) else 0.0
        # texts are joined into one string below, so keep them as strings
        concept_text = str(row.get('text', ''))
        if concept_text != "" and concept_text not in entry['text']:
            entry['text'].append(concept_text)
        entry['sec_score'] += sec_score
        entry['sig_score'] += sig_proximity
        entry['score'] += sec_score + sig_proximity
        entry['count'] += 1

    # Convert the dictionary back to a DataFrame, adding combined columns for display.
    # Naming the columns keeps them present when there are no concepts at all.
    columns = ['id', 'text', 'label', 'sec_score', 'sig_score', 'score', 'count']
    df = pd.DataFrame(list(sum_by_id.values()), columns=columns)
    df["span"] = [make_link(cid, ", ".join(texts)) for cid, texts in zip(df['id'], df['text'])]
    df["score_explain"] = [
        f"({round(sec, 2)} + {round(sig, 2)})" for sec, sig in zip(df['sec_score'], df['sig_score'])
    ]
    return df


# build the UI: four weight sliders sharing one range, step and width
slider_t, slider_a, slider_b, slider_s = (
    FloatSlider(description=caption, value=initial, min=0.0, max=50.0, step=0.1, layout=Layout(width='500px'))
    for caption, initial in (
        ('Title:', 40.0),
        ('Abstract:', 2.0),
        ('Body:', 0.1),
        ('Significance:', 2.0),
    )
)

# file picker offering the CSV files found in the data folder
filelist = Dropdown(
    description='Select file:',
    options=get_input_file_names(DATA_FOLDER),
    layout={'width': 'max-content'},
    disabled=False,
)
refresh = Button(description="Refresh results")
outputs = Output()

# show every component in a single display call
display(
    filelist,
    slider_t,
    slider_a,
    slider_b,
    slider_s,
    refresh,
    outputs,
)

# refresh results when button is clicked
def on_button_click(b):
    """Recompute the concept ranking for the selected file and show the top 20.

    Args:
        b: The widget that triggered the callback (unused).
    """
    outputs.clear_output()
    selected_file = filelist.value
    if not selected_file:
        return

    # Run the whole refresh inside the Output context so that whatever it
    # emits, including the traceback of a failed load, lands in the output
    # area instead of being lost outside the widget.
    with outputs:
        #df = get_results_by_row(selected_file)
        df = get_results_by_concept(selected_file)
        df = df.sort_values(by='score', ascending=False).head(20)
        # escape=False is needed so the links in 'span' render as HTML, but it
        # also disables escaping for 'label', so escape that column here
        df = df.assign(label=df['label'].map(lambda value: escape(str(value))))
        display(HTML(df.to_html(index=False, render_links=True, escape=False, columns=['span', 'label', 'count', 'sec_score', 'sig_score', 'score'])))
        #display(HTML(df.to_html(index=False, columns=['id', 'text', 'label', 'sections', 'sec_score', 'sig_proximity', 'score', 'score_explain', 'context'])))
refresh.on_click(on_button_click)




Dropdown(description='Select file:', layout=Layout(width='max-content'), options=('ie-output-text-extraction-0…

FloatSlider(value=40.0, description='Title:', layout=Layout(width='500px'), max=50.0)

FloatSlider(value=2.0, description='Abstract:', layout=Layout(width='500px'), max=50.0)

FloatSlider(value=0.1, description='Body:', layout=Layout(width='500px'), max=50.0)

FloatSlider(value=2.0, description='Significance:', layout=Layout(width='500px'), max=50.0)

Button(description='Refresh results', style=ButtonStyle())

Output()