In [2]:
import pandas as pd
import re

In [3]:
def process_stackexchange(board: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load a Stack Exchange board dump and extract DOIs and markdown links.

    Reads ``Comments.xml`` and ``PostHistory.xml`` from ``./data/<board>/`` and
    adds three columns to each frame: ``DOIs``, ``LinkTitle`` and ``LinkURL``.
    Only the first DOI and the first inline markdown link found in each
    ``Text`` value are kept; rows without a match hold NaN in those columns.

    Args:
        board (str): Stackexchange board to load. Expected file structure is
            ./data/`board`/*.xml

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(comments, post_history)`` frames
            holding the original data plus the extracted DOI and markdown
            link columns.
    """
    # Regex patterns. Raw strings keep the backslashes intact for the regex
    # engine instead of relying on Python passing unknown escapes through.
    DOI_PATTERN = r"(10\.\d{4,9}/[-._;()/:A-Z0-9]+[/A-Z0-9])"
    # Inline markdown link: [title](url) or [title](url "tooltip").
    #   group 1 - title: anything except square brackets
    #   group 2 - URL: no whitespace, parentheses only as balanced "(...)"
    #             pairs so the link's closing ")" is not swallowed
    # The optional tooltip is matched but not captured, so extract() still
    # yields exactly two columns.
    MD_PATTERN = (
        r'\[([^\[\]]+)\]'
        r'\((https?://(?:[^\s()]|\([^\s()]*\))+)'
        r'(?:\s+"[^"]*")?\)'
    )

    def process_set(dataset: str) -> pd.DataFrame:
        """Processes single file in board data

        Args:
            dataset (str): One of "Comments" or "PostHistory"

        Returns:
            pd.DataFrame: Original data with extracted DOIs and markdown links
        """
        # Hand pandas the path rather than the decoded file contents: passing
        # literal XML text to read_xml is deprecated since pandas 2.1.
        path = f"data/{board}/{dataset}.xml"
        df = pd.read_xml(path, encoding="utf-8")

        # Extract DOIs (first match per row). expand=False returns a Series,
        # which is what a single-column assignment expects.
        df["DOIs"] = df["Text"].str.extract(
            DOI_PATTERN, flags=re.IGNORECASE, expand=False
        )

        # Extract markdown links (first match per row): one column per group.
        markdown_df = df["Text"].str.extract(MD_PATTERN, expand=True)
        markdown_df.columns = ["LinkTitle", "LinkURL"]

        return pd.concat([df, markdown_df], axis=1)

    return process_set("Comments"), process_set("PostHistory")


In [5]:
# Parse the Comments and PostHistory dumps of the "stats" board
# (read from ./data/stats/*.xml) into one frame each.
comments_df, posthistory_df = process_stackexchange(board="stats")

In [6]:
# Comments in which a DOI was found (rows without a match are NaN).
comments_df["DOIs"].dropna()

1052               10.1145/1508128.1508139
2029                    10.1007/BF01897163
2169         10.1016/S0378-3758(97)00050-5
3381                    10.1007/BF02480942
3382                       10.1137/0118065
                        ...               
708321    10.1111/j.1467-9892.2012.00819.x
708440            10.1177/1094428106296639
708739                  10.1093/pan/mpl013
709076        10.1016/j.intell.2014.05.007
709247         10.1016/j.ssmph.2019.100526
Name: DOIs, Length: 1701, dtype: object

In [16]:
# Earliest revision for each distinct DOI: order by creation timestamp, then
# keep the first row per DOI value. Missing DOIs count as one value, so a
# single row without a DOI is retained as well.
(
    posthistory_df
    .sort_values(by="CreationDate")
    .drop_duplicates(subset="DOIs", keep="first")
)

Unnamed: 0,Id,PostHistoryTypeId,PostId,RevisionGUID,CreationDate,UserId,Text,ContentLicense,Comment,UserDisplayName,DOIs,LinkTitle,LinkURL
42530,47930,1,16198,f2d5ed7a-52d0-4831-8056-d337bd96ac72,2009-02-02T14:21:12.103,112726.0,What is an Average that does not include outli...,CC BY-SA 2.5,,Tawani,,,
649,685,2,321,4e355f03-d3bf-4665-a1e7-c971a64ccb82,2010-07-20T16:01:25.590,220.0,There is a variant of boosting called [gentleb...,CC BY-SA 2.5,,,10.1214/aos/1016218223,,
932,1009,2,473,c7d165c3-d32d-4735-bdfb-0edbda47109e,2010-07-22T02:37:18.820,260.0,The traditional solution to this problem is to...,CC BY-SA 2.5,,,10.1137/S0036144598347035,,
948,1026,5,483,e7232a29-6bea-4ea0-ac51-858f870898a3,2010-07-22T10:04:38.557,,"The main idea is the bagging procedure, not ma...",CC BY-SA 2.5,added 303 characters in body,user88,10.1007/s10994-006-6226-1,,
1123,1207,2,573,146975c9-a714-4bbd-8870-2800eb46f32d,2010-07-23T19:45:45.270,190.0,"In ""[Convolutional deep belief networks for sc...",CC BY-SA 2.5,,,10.1145/1553374.1553453,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1344435,2180684,2,566611,18ba5d12-3a99-42bf-b66c-538ee51c8dc0,2022-03-04T09:12:38.957,351083.0,I am trying to carry out a Monte Carlo simulat...,CC BY-SA 4.0,,,10.1038/nrn3475,,
1344460,2180721,2,566620,fc718bf5-ad5a-42fa-a7d0-ce69173ae1e3,2022-03-04T10:09:04.723,240280.0,"According to [1], $P(s=1|z)$ --- the probabili...",CC BY-SA 4.0,,,10.1007/978-3-540-87987-9_8,,
1344740,2181117,2,566699,30720f0e-93f1-44d6-8790-55dd78f360df,2022-03-04T23:29:54.257,143790.0,Is there an implementation of the Behrens-Fish...,CC BY-SA 4.0,,,10.1007/978-3-030-42196-0_9,,
1344939,2181519,5,566656,be22fb5e-c0e9-499d-9697-25c491629d5e,2022-03-05T15:40:41.553,28500.0,In H.Putter & H.C. van Houwelingen's paper\r\n...,CC BY-SA 4.0,added link to paper,,10.1007/s12561-016-9157-9,,
