In [None]:
import re
import requests as rq
from urllib.parse import urljoin
import bs4
from bs4 import BeautifulSoup
from pathlib import Path
import os
import glob

import json

In [None]:
def process_files_to_json(data_path: str) -> None:
    """Convert the raw HTML dumps under ``data_path`` into one JSON Lines file.

    For every ``{data_path}/raw/htmls/*.txt`` file, the abstract file with the
    same file name under ``{data_path}/raw/abstracts`` is read, the title and
    section text are pulled out of the HTML with ``extract_text``, and one JSON
    object per article is written, one per line, to
    ``{data_path}/processed/json/processed.json``.

    Articles whose HTML cannot be parsed are reported on stdout and skipped.

    Args:
        data_path: Root data directory containing ``raw/htmls`` and
            ``raw/abstracts``.

    Raises:
        OSError: If an HTML or abstract file cannot be read, or the output
            file cannot be written.
    """
    root = Path(data_path)
    abstracts_dir = root / "raw" / "abstracts"
    # Sorted so the record order in the output does not depend on the
    # filesystem's directory listing order.
    htmls = sorted(glob.glob(f"{data_path}/raw/htmls/*.txt"))

    output_dicts = []

    for html_file in htmls:
        html_path = Path(html_file)
        # Swap only the directory, not every "htmls" substring of the path.
        abstract_file = abstracts_dir / html_path.name
        print(html_file, abstract_file)

        with open(html_file, "r", encoding="utf8") as h_f:
            html_content = h_f.read()

        with open(abstract_file, "r", encoding="utf8") as a_f:
            abstract_content = a_f.read()

        try:
            title, text_content = extract_text(html_content)
        except Exception as e:
            # Best-effort batch: one unparsable page must not abort the run,
            # but say which file was dropped and why.
            print(f"Skipping {html_file}: {e}")
            continue

        output_dicts.append({
            # File name without directory or extension; works with both
            # "/" and "\\" separators, unlike splitting on a backslash.
            "article_id": html_path.stem,
            "title": title,
            "abstract": abstract_content,
            "text_content": text_content.strip(),
        })

    output_file = root / "processed" / "json" / "processed.json"
    # Do not rely on another cell having created the output directory.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, "w", encoding="utf8") as f:
        for obj in output_dicts:
            json.dump(obj, f, ensure_ascii=False)
            f.write("\n")
        
def get_paper_sections(paper_soup: bs4.element.Tag) -> bs4.element.ResultSet:
    """Return every ``section.ltx_section`` element found under ``paper_soup``."""
    return paper_soup.select("section.ltx_section")


def parse_section(section: bs4.element.Tag) -> tuple:
    """Split one paper section into its heading and its text.

    Args:
        section: A section element of the paper.

    Returns:
        ``(section_heading, section_text)``. The heading is the stripped text
        of the first ``h2.ltx_title_section`` inside ``section``, or ``""``
        when the section has no such element. The text is ``section.text``,
        unstripped.
    """
    heading_tag = section.select_one("h2.ltx_title_section")
    # select_one returns None when nothing matches; a section without a
    # heading should still contribute its text instead of raising.
    section_heading = heading_tag.text.strip() if heading_tag is not None else ""

    return section_heading, section.text

def extract_text(html_content: str) -> tuple:
    """Extract the document title and the joined section text from one HTML page.

    Args:
        html_content: HTML markup of a single paper.

    Returns:
        ``(title, full_text)``: ``title`` is the unstripped text of the
        ``h1.ltx_title_document`` element; ``full_text`` is the text of each
        section returned by ``get_paper_sections`` for the
        ``article.ltx_document`` element, joined with blank lines.

    Raises:
        AttributeError: If the title element or the article element is
            missing (``select_one`` returned ``None``).
    """
    # NOTE(review): "html" names a markup type rather than a specific parser,
    # so bs4 chooses among the installed HTML parsers; pin one (e.g.
    # "html.parser") if output must be identical across environments.
    soup = BeautifulSoup(html_content, "html")
    title = soup.select_one("h1.ltx_title_document").text
    full_paper = soup.select_one("article.ltx_document")

    # Only the text half of each (heading, text) pair is needed here.
    section_texts = [
        parse_section(section)[1]
        for section in get_paper_sections(paper_soup=full_paper)
    ]

    return title, "\n\n".join(section_texts)

In [None]:
data_path = "../data"
processed_output_path = Path(data_path) / "processed" / "json"

# parents=True also creates data_path itself when it is missing, and
# exist_ok=True makes re-running this cell a no-op instead of an error.
processed_output_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Build processed.json from the raw HTML and abstract files under data_path.
process_files_to_json(data_path=data_path)