In [49]:
import json
import sys
from functools import partial
sys.path.append("../") 
from src.data.utils import read_jsonl, extract_text_by_headers_html, dedup_results, HTML_PARSER, FORMATTING_FNS, merge_small_splits, split_string_to_segments

In [50]:
# Read the JSONL file into a list with one parsed JSON object per line.
# The encoding is given explicitly so decoding does not depend on the
# platform default, and blank lines (e.g. a trailing newline at the end of
# the file) are skipped instead of raising json.JSONDecodeError.
with open("../data/raw/task1/crag_task_1_v2.jsonl", encoding="utf-8") as f:
    dataset = [json.loads(line) for line in f if line.strip()]


In [72]:
from bs4 import BeautifulSoup

def split_string_to_segments(string, max_string_length):
    """Cut ``string`` into two parts near its middle.

    The cut is made on the coarsest separator present in the string:
    newlines first, then single spaces. The list of separator-delimited
    tokens is halved by token count (not by character count) and each half
    is re-joined with the same separator. When neither separator occurs,
    the string is cut at its middle character.

    Args:
        string: Text to divide.
        max_string_length: Accepted for interface compatibility; it is not
            used when choosing the cut point.

    Returns:
        A two-element list ``[first_part, second_part]``. Either part may
        be empty (for example when ``string`` has fewer than two characters).
    """
    for separator in ("\n", " "):
        if separator not in string:
            continue
        tokens = string.split(separator)
        middle = len(tokens) // 2
        head, tail = tokens[:middle], tokens[middle:]
        return [separator.join(head), separator.join(tail)]

    # No separator at all: fall back to a plain character-level cut.
    cut = len(string) // 2
    return [string[:cut], string[cut:]]


def merge_small_splits(splits, max_string_length, small_split_length=100):
    """Glue short text blocks onto the block that follows them.

    A block is merged with its successor (joined by a single space) when it
    is shorter than the successor, shorter than ``small_split_length``, and
    the combined length stays below ``max_string_length``. Each block takes
    part in at most one merge.

    Header blocks (text that starts with ``#`` once leading whitespace is
    ignored) are never merged in either direction: a header is not glued to
    the text after it, and a short paragraph is not glued to a header that
    follows it. Without the second rule the header would end up in the
    middle of a merged block and no longer be recognisable as a header.

    Args:
        splits: Sequence of text blocks in document order.
        max_string_length: Exclusive upper bound for the length of a merged
            block (the joining space is not counted).
        small_split_length: Exclusive length limit below which a block is
            considered small enough to merge. Defaults to 100.

    Returns:
        A new list of blocks; ``splits`` itself is not modified.
    """
    merged = []
    total = len(splits)
    i = 0
    while i < total:
        current = splits[i]
        is_last = i + 1 == total
        if (
            is_last
            or current.lstrip().startswith("#")
            or splits[i + 1].lstrip().startswith("#")
        ):
            merged.append(current)
            i += 1
            continue

        following = splits[i + 1]
        if (
            len(current) < len(following)
            and len(current) < small_split_length
            and len(current) + len(following) < max_string_length
        ):
            merged.append(current + " " + following)
            i += 2  # the successor has been consumed by the merge
        else:
            merged.append(current)
            i += 1
    return merged

def heuristic_clean_html(html_data, min_text_ratio=0.8):
    """Strip boilerplate from an HTML page and return its main content as HTML.

    The page is parsed with BeautifulSoup's ``html.parser``; every
    ``script``, ``style``, ``link``, ``footer`` and ``header`` element is
    removed. Starting from the document, the function then repeatedly steps
    into the first child element whose text is longer than
    ``min_text_ratio`` times the text of the current node, and stops when no
    child dominates like that.

    Args:
        html_data: HTML markup to clean.
        min_text_ratio: Fraction of the current node's text length that a
            child element must exceed to become the new root. Defaults to
            0.8.

    Returns:
        The markup of the selected node as a string.
    """
    soup = BeautifulSoup(html_data, 'html.parser')
    for element in soup.find_all(["script", "style", "link", "footer", "header"]):
        element.extract()

    root = soup
    while True:
        root_len = len(root.get_text())
        dominant_child = None
        # Only element children are candidates. A text node must never become
        # the root: iterating a text node yields plain characters, which have
        # no get_text(), so the next pass would fail.
        for child in root.find_all(True, recursive=False):
            if len(child.get_text()) > root_len * min_text_ratio:
                dominant_child = child
                break
        if dominant_child is None:
            break
        root = dominant_child
    return str(root)


def _split_oversized_text(text, max_string_length):
    """Break ``text`` into ordered pieces of at most ``max_string_length`` chars.

    Pieces are produced by repeatedly halving with
    ``split_string_to_segments``. A stack is used so that pieces come out in
    their original left-to-right order. Whitespace-only pieces are dropped.
    """
    pieces = []
    pending = [text]  # stack: the left-most unprocessed piece is on top
    while pending:
        candidate = pending.pop()
        # A piece of length <= 1 cannot be halved any further; accept it even
        # if it exceeds the limit rather than looping forever.
        if len(candidate) <= max_string_length or len(candidate) <= 1:
            if candidate.strip():
                pieces.append(candidate)
            continue
        halves = split_string_to_segments(candidate, max_string_length)
        # Push in reverse so the first half is processed before the second.
        pending.extend(reversed(halves))
    return pieces


def html2text_parser(json_obj, version="v2", max_string_length=2000):
    """Convert one search result's HTML into header-grouped text chunks.

    Steps: clean the HTML with ``heuristic_clean_html``, convert it with
    ``HTML_PARSER.handle`` (imported from ``src.data.utils``; presumably an
    HTML-to-Markdown converter -- confirm), cut everything from
    ``"## References[edit]"`` onwards, split on blank lines, merge small
    blocks with ``merge_small_splits`` and group the remaining text under the
    most recent header.

    Args:
        json_obj: Mapping with a ``"page_result"`` entry (the HTML) and a
            ``"page_name"`` entry.
        version: Key into ``FORMATTING_FNS`` selecting the output formatter.
        max_string_length: Maximum length of a single text chunk; longer
            blocks are split further.

    Returns:
        Whatever ``FORMATTING_FNS[version]`` returns when called with the
        ``{header: [chunks]}`` dict and the page name.

    Raises:
        KeyError: If ``version`` is not in ``FORMATTING_FNS`` or ``json_obj``
            lacks one of the required entries.
    """
    formatting_fn = FORMATTING_FNS[version]
    page_name = json_obj["page_name"]
    cleaned_html = heuristic_clean_html(json_obj["page_result"])
    markdown_text = HTML_PARSER.handle(cleaned_html)
    # Keep only the part before the references section, if there is one.
    markdown_text = markdown_text.split("## References[edit]")[0]

    # Text that appears before the first header is stored under "".
    sections = {"": []}
    current_header = ""
    for block in merge_small_splits(markdown_text.split("\n\n"), max_string_length):
        block = block.strip()
        if block.startswith("#"):
            # A header consisting only of '#' keeps the previous header active.
            current_header = block.replace("#", " ").strip() or current_header
            sections.setdefault(current_header, [])
            continue

        # Drop table markup and collapse the runs of spaces it leaves behind.
        # Newlines are kept because split_string_to_segments prefers them as
        # cut points.
        without_markup = block.replace("|", " ").replace("---", " ")
        text = " ".join(part for part in without_markup.split(" ") if part).strip()
        if not text:
            continue
        sections[current_header].extend(_split_oversized_text(text, max_string_length))
    return formatting_fn(sections, page_name)

In [73]:
def check_html_parse(json_obj, search_result_idx, dump_path="../data/html/test.html"):
    """Dump one search result's raw HTML to disk and print its parsed chunks.

    Prints, in order: the dump path, the example's query and answer, a
    separator line, and then every element returned by ``html2text_parser``
    for the selected search result, each followed by its length and a
    separator line.

    Args:
        json_obj: Dataset example with ``"search_results"``, ``"query"`` and
            ``"answer"`` entries; each search result must have a
            ``"page_result"`` entry holding the page HTML.
        search_result_idx: Index into ``json_obj["search_results"]``.
        dump_path: File the raw HTML is written to (overwritten on each
            call), so the page can be opened for side-by-side inspection.
    """
    search_result = json_obj["search_results"][search_result_idx]
    # Write as UTF-8 explicitly: pages contain non-ASCII characters, and the
    # platform default encoding may not be able to represent them.
    with open(dump_path, "w", encoding="utf-8") as f:
        f.write(search_result["page_result"])

    print(dump_path)
    print(json_obj["query"])
    print(json_obj["answer"])
    print("-"*100)
    for parsed_element in html2text_parser(search_result):
        print(parsed_element)
        print(len(parsed_element))
        print("="*100)

In [74]:
# Pick one dataset example and one of its search results to inspect.
dataset_idx, result_idx = 0, 1
json_obj = dataset[dataset_idx]

# List the page name of every search result, one per line, then show how the
# selected result is parsed.
page_names = [result["page_name"] for result in json_obj["search_results"]]
print("\n".join(page_names))
check_html_parse(json_obj, result_idx)

NBA All-Time Assist Leaders and Record | Dunkest
Ranking Top 10 Players That Never Made NBA All-Star Team | NBA.com
NBA's Best Players Who Never Made an All-Star Team | News, Scores, ...
NBA All-Time Assists Leaders - National Basketball Association - ESPN
NBA & ABA Career Leaders and Records for Assists | Basketball-...
../data/html/test.html
which player has the most career assists in the nba among players who have never been named to an all-star game?
andre miller has the most career assists in the nba among players who have never been named to an all-star game, with 8,524 assists.
----------------------------------------------------------------------------------------------------
Ranking Top 10 Players That Never Made NBA All-Star Team | NBA.com
:
OrlandoMagic.com's Josh Cohen ranks his 10 best players that never made an NBA
All-Star team (inactive players only). Click through this slideshow for full
analysis and rankings.
247
Ranking Top 10 Players That Never Made NBA All-Star Tea

In [75]:
def heuristic_clean_tag(html_data, min_text_ratio=0.8):
    """Strip boilerplate from an HTML page and return its main content node.

    The page is parsed with BeautifulSoup's ``html.parser``; every
    ``script``, ``style``, ``link``, ``footer`` and ``header`` element is
    removed. Starting from the document, the function then repeatedly steps
    into the first child element whose text is longer than
    ``min_text_ratio`` times the text of the current node, and stops when no
    child dominates like that.

    Args:
        html_data: HTML markup to clean.
        min_text_ratio: Fraction of the current node's text length that a
            child element must exceed to become the new root. Defaults to
            0.8.

    Returns:
        The selected node: either the parsed document itself or one of its
        descendant elements.
    """
    soup = BeautifulSoup(html_data, 'html.parser')
    for element in soup.find_all(["script", "style", "link", "footer", "header"]):
        element.extract()

    root = soup
    while True:
        root_len = len(root.get_text())
        dominant_child = None
        # Only element children are candidates. A text node must never become
        # the root: iterating a text node yields plain characters, which have
        # no .text, so the next pass would fail.
        for child in root.find_all(True, recursive=False):
            if len(child.get_text()) > root_len * min_text_ratio:
                dominant_child = child
                break
        if dominant_child is None:
            break
        root = dominant_child
    return root

# Reduce the selected search result's HTML to its dominant content node.
selected_result = json_obj["search_results"][result_idx]
cleaned_root = heuristic_clean_tag(selected_result["page_result"])

In [76]:
# Show the text HTML_PARSER produces for the cleaned content node.
cleaned_text = HTML_PARSER.handle(str(cleaned_root))
print(cleaned_text)

OrlandoMagic.com's Josh Cohen ranks his 10 best players that never made an NBA
All-Star team (inactive players only). Click through this slideshow for full
analysis and rankings.

Note: The contents of this gallery have not been reviewed or endorsed by the
Orlando Magic and do not reflect the opinions of the Magic’s Basketball
Operations staff, partners or sponsors.

Tied #10 Andre Miller

Several other point guards, including Rod Strickland, Jason Terry, Mike Bibby
and Derek Harper, all deserve to be mentioned on this list. But Miller, who
dished out 8,524 assists in his career – 11th most in NBA history and the most
among players who never made an All-Star team, gets the edge. The 6-foot-3
floor general led the NBA in assists during the 2001-02 season. On Dec. 15 of
that year, Miller handed out 22 dimes against the 76ers. He had 20 assists in
another game against the Heat in 2006.

Three former Magic players – Nick Anderson, Hedo Turkoglu and Jason Richardson
– all warrant considerat

In [48]:
# Length, in characters, of the text produced for the cleaned content node.
cleaned_text_length = len(HTML_PARSER.handle(str(cleaned_root)))
print(cleaned_text_length)

1041
