In [34]:
import pandas as pd
import re

In [45]:
def process_stackexchange(
    board: str,
    *,
    data_dir: str = "data",
    parser: str = "lxml",
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load a Stack Exchange board dump and extract DOIs and markdown links.

    Reads ``Comments.xml`` and ``PostHistory.xml`` from
    ``<data_dir>/<board>/`` and, for each file, appends three columns derived
    from its ``Text`` column: ``DOIs``, ``LinkTitle`` and ``LinkURL``. Only
    the first DOI and the first markdown link found in each row are kept;
    rows without a match hold a missing value.

    Args:
        board (str): Stackexchange board to load. Expected file structure is
            ``<data_dir>/<board>/*.xml``.
        data_dir (str): Directory that contains one sub-directory per board.
            Defaults to ``"data"`` (relative to the working directory).
        parser (str): XML parser passed to ``pandas.read_xml``. Defaults to
            ``"lxml"`` (the pandas default); ``"etree"`` uses the standard
            library parser instead.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(comments, post_history)`` - the
            Comments and PostHistory data, each with the extracted DOI and
            markdown link columns appended.
    """
    # Regex patterns (raw strings, so the backslashes reach the regex engine
    # without triggering invalid-escape warnings).
    # The DOI pattern is matched case-insensitively. Suffixes made of legal
    # DOI characters (e.g. ".pdf" or "/full" copied from a URL) are captured
    # as part of the DOI.
    DOI_PATTERN = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+[/A-Z0-9])"
    # [title](http(s)://url): the title may hold anything except square
    # brackets, the URL anything except whitespace and parentheses, so that
    # URLs with '-', '%', '&', ':' ... (e.g. hyphenated DOI links) still match.
    MD_PATTERN = r"\[([^\]\[]+)\]\((https?://[^\s()]+)\)"

    def process_set(name: str) -> pd.DataFrame:
        """Processes single file in board data

        Args:
            name (str): One of "Comments" or "PostHistory"

        Returns:
            pd.DataFrame: Original data with extracted DOIs and markdown links
        """
        # Hand pandas the path rather than the file contents: passing literal
        # XML text to read_xml is deprecated.
        path = f"{data_dir}/{board}/{name}.xml"
        df = pd.read_xml(path, encoding="utf-8", parser=parser)

        # Extract DOIs (expand=False -> a Series, since there is one group)
        df["DOIs"] = df["Text"].str.extract(
            DOI_PATTERN, flags=re.IGNORECASE, expand=False
        )

        # Extract markdown links (two groups -> two columns)
        markdown_df = df["Text"].str.extract(MD_PATTERN)
        markdown_df.columns = ["LinkTitle", "LinkURL"]

        return pd.concat([df, markdown_df], axis=1)

    return process_set("Comments"), process_set("PostHistory")


In [46]:
# Load the "ai" board. process_stackexchange returns the Comments frame
# first and the PostHistory frame second, hence the unpacking order.
comments_df, posthistory_df = process_stackexchange("ai")

In [47]:
# Show only the comments in which a DOI was found (missing values dropped).
comments_df["DOIs"].dropna()

58            10.3389/fnins.2015.00217
4655      10.1371/journal.pcbi.1003024
6142         10.1007/s10994-005-0916-y
7528       10.1007/978-3-319-77553-1_9
8211     10.1007/978-3-540-85984-0_113
11446          10.1145/3379247.3379276
11449    10.3389/fninf.2019.00053/full
12377        10.1007/s00766-007-0045-1
12517        10.1109/ICRA.2018.8461044
13713             10.1162/NECO_a_00949
14450             10.1162/isal_a_00197
14868        10.1186/s40537-019-0197-0
15063     10.1007/978-3-642-35289-8_36
15369           10.1007/BF00992696.pdf
15546           10.1007/BF00849065.pdf
16482          10.1145/2212908.2212954
16988        10.1016/j.tcs.2009.08.027
20067        10.1109/TEVC.2021.3060014
20177    10.1080/13658816.2018.1542698
20715            10.1145/321186.321193
21306           10.1186/1471-2105-11-9
Name: DOIs, dtype: object