<a href="https://colab.research.google.com/github/amina-mardiyyah/BioHackathon_25/blob/main/Sentencise_BioMedical_Articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#install required dependencies, assuming you have other basic packages installed. Restart the kernel/session after running this cell
!pip install uv
!uv pip install scispacy spacy==3.7.5 "numpy<2"
!uv pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz

Collecting uv
  Downloading uv-0.9.15-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.9.15-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.7/21.7 MB[0m [31m61.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uv
Successfully installed uv-0.9.15
[2mUsing Python 3.12.12 environment at: /usr[0m
[2K[2mResolved [1m51 packages[0m [2min 698ms[0m[0m
[2K[2mPrepared [1m10 packages[0m [2min 1.88s[0m[0m
[2mUninstalled [1m4 packages[0m [2min 304ms[0m[0m
[2K[2mInstalled [1m10 packages[0m [2min 78ms[0m[0m
 [31m-[39m [1mblis[0m[2m==1.3.3[0m
 [32m+[39m [1mblis[0m[2m==0.7.11[0m
 [32m+[39m [1mconllu[0m[2m==6.0.0[0m
 [32m+[39m [1mlangcodes[0m[2m==3.5.1[0m
 [32m+[39m [1mnmslib-metabrainz[0m[2m==2.1.3[0m
 [31m-[39m [1mnumpy[0m[2m==2.0.2[0m
 [32m+[39m [1mnumpy[0m[2m==1.26.4[0m
 [32m+[39m [1mp

In [16]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import re
import scispacy
import spacy
from tqdm import tqdm


In [3]:
#import data to be sentencised. For this demo, I'll show how to load fullText XML directly from EPMC, filter out irrelevant tags then sentencise the articles.

#load pmclist:
pmcid_list = [
    "PMC5779697",
    "PMC8259984",
    "PMC10469212",
    "PMC10513930",
    "PMC4684364",
    "PMC11269705",
    "PMC6691112",
    "PMC10630469",
    "PMC4606567",
    "PMC9170690",
    "PMC10902711",
    "PMC11458896",
    "PMC10728066",
    "PMC6097988",
    "PMC9599481",
    "PMC8342504",
    "PMC7417482",
    "PMC5050431",
    "PMC6461633",
    "PMC4629127",
    "PMC8521299",
    "PMC5339821",
    "PMC8755711",
    "PMC8481471",
    "PMC10776612",
    "PMC9837121",
    "PMC5473837",
    "PMC3329542",
    "PMC6629680",
    "PMC5286405",
    "PMC9301695",
    "PMC11413214",
    "PMC8748654",
    "PMC11127996",
    "PMC10491837",
    "PMC7897626",
    "PMC9546883",
    "PMC9357011",
    "PMC6128551",
    "PMC9352691",
    "PMC3954792",
    "PMC11564093",
    "PMC6760179",
    "PMC7592055",
    "PMC7449968",
    "PMC9011034",
    "PMC10175254",
    "PMC4906355"
]
len(pmcid_list)

48

In [10]:
#define a scispacy model
nlp = spacy.load("en_core_sci_md", disable=["tagger", "parser", "ner", "lemmatizer", "attribute_ruler"]) #drop irrelevant parts of the pipe
nlp.add_pipe("sentencizer") #define sentenciser
nlp.max_length = 10_000_000

In [11]:

#function to fetch articles xml
def get_xml(pmcid):
    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{pmcid}/fullTextXML"
    try:
        response = requests.get(url)
        response.raise_for_status() # Raise an error if the response code is not 200
        return bs(response.content, "lxml-xml")

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")

    except requests.exceptions.RequestException as req_err:
        print(f"Request error occurred: {req_err}")
    print(f"Failed to retrieve XML file for PMCID: {pmcid}") #articles can be available in other formats other than xml, we can skip these
    return None


def clean_text(text):
  """
  This function cleans text by removing LaTeX-style math and formatting tags,
  reference tags, and extra whitespace.


  """

  # Remove LaTeX-style math and formatting tags
  text = re.sub(r"\{.*?\}", "", text)  # Matches and removes anything inside curly braces {}
  text = re.sub(r"\\[a-zA-Z]+", "", text)  # Matches and removes LaTeX-style commands (e.g., \usepackage)

  # Remove reference tags like [34] or [1,2,3]
  text = re.sub(r"\[\d+(?:,\d+)*\]", "", text)

  # Remove extra whitespace
  text = re.sub(r"\s+", " ", text).strip()

  return text



def filter_tags(soup):
  """
  This function cleans irrevelant tags from an xml file.


  """
  tags2ignore = ['inline-formula', 'supplementary-material', 'ack', 'contrib-group',
                  "disclaimer","Disclosure",
                  'sup', 'Acknowledgments','COI-statement']
  section_keywords = ['Disclaimer', 'author contributions',
                      'Conflict of interest', "Publisher’s note",
                      'Supplementary information', "Supplementary material", "Disclosure",
                      ]

  # Remove tags by name
  for tag_name in tags2ignore:
      for tag in soup.find_all(tag_name):
          tag.decompose()

  # Remove sections containing specific titles
  for sec in soup.find_all('sec'):
      title_tag = sec.find('title')
      if title_tag and any(keyword.strip().lower() in title_tag.text.strip().lower() for keyword in section_keywords):
          sec.decompose()

  return soup


def get_full_text(soup):
    """
    This function extracts the title and full text from an xml file.
    It retrieves the XML using the PMCIDs, cleans the text and removes irrelevant tags.
    Returns the title and full text.
    Args:
      soup: A beautifulsoup element

    Returns:
      title: The cleaned title of the article
      full_text: The cleaned full text of the article

    """
    #define tags to ignore in xml content

    if soup:
        try:
            #filter unwanted tags
            soup = filter_tags(soup)
            #extract article title
            title = soup.find("article-title").text if soup.find("article-title") else "Article is missing a title" #encountered some of these earlier, specifically with books. so catching them here
            #clean title for extra whitespace using clean text fun
            title = clean_text(title)

            #extract abstract
            abstract = soup.find("abstract")
            abstract_text = (" ".join([clean_text(p.text) for p in abstract.find_all("p") if p.text]) if abstract else "")

            #extract body of article

            body_tag = soup.find("body")
            body_text = (" ".join([clean_text(p.text) for p in body_tag.find_all("p")
                                            if p.text]) if body_tag else "")

            full_text = f"{abstract_text}{body_text}".strip()

            return title, full_text


        except Exception as e:
            print(f"Error processing XML: {e}")
            return "Error extracting Article", ""



    else:
        return "No XML provided", ""


def sentencise_articles(row,nlp=nlp):
    """
    Sentencises by splitting into sentences using a spacy biomedical model.
    Args:
      row: A pandas series element
      nlp: A spacy/scispacy biomedical model

    Returns:
      articles: A list of dictionaries containing the PMCID, title, and individual sentences.
      The PMCIDs are attached to track what sentences belong to what article.

    """
    articles = []
    pmcid = row["PMCID"]
    xml = get_xml(pmcid)
    title, full_text = get_full_text(xml)
    if full_text:
        doc = nlp(full_text)
        sentences = [postprocess_sentences(sent.text) for sent in doc.sents]
        for sent in sentences:
            articles.append({"PMCID": pmcid, "Title": title, "Sentences": sent})
    return articles




In [6]:
#other useful functions to speed up processes in the case of large files/articles
from multiprocessing import Pool, cpu_count

def postprocess_sentences(sentence: str) -> str:


    cleaned = re.sub(r"^[()\da-zA-Z]+\)|^\)|^\(|\)[()\da-zA-Z]+$", "", sentence.strip())
    return cleaned.strip()

def parallel_process_articles(df: pd.DataFrame, func: callable, processing_type) -> pd.DataFrame:
    """
    Generic parallel processing function for DataFrame operations.

    Args:
        df (pd.DataFrame): The input DataFrame.
        func (callable): The function to apply to each element or row

    Returns:
        pd.DataFrame: Processed DataFrame.
    """


    data = [row for _, row in df.iterrows()]
    #print(f"And data looks like this: {data[0]}")

    with Pool(cpu_count()) as pool:
        results = list(tqdm(pool.imap(func, data), total=len(data), desc="Extracting and processing Articles"))
        #print(f"While results looks like this: {results}")
    # Flatten lists and filter out malformed results if applicable
    flattened_results = [item for sublist in results if isinstance(sublist, list) for item in sublist]
    if processing_type == 'par':
        columns = ["PMCID", "Title", "Section", "Paragraph_text"]
        articles = pd.DataFrame(flattened_results, columns=columns)
        return articles

    elif processing_type == 'sent':
        return pd.DataFrame(flattened_results)




In [17]:
#extract fulltext xml from EPMC using the PMCIDs and sentencise
pmcid_df = pd.DataFrame(pmcid_list, columns=["PMCID"])
sent_df = parallel_process_articles(pmcid_df,sentencise_articles, processing_type='sent')
print(sent_df.shape)
sent_df.head(10)

Extracting and processing Articles: 100%|██████████| 48/48 [01:30<00:00,  1.89s/it]

(15695, 3)





Unnamed: 0,PMCID,Title,Sentences
0,PMC5779697,Adipose tissue ATGL modifies the cardiac lipid...,Adipose tissue lipolysis occurs during the dev...
1,PMC5779697,Adipose tissue ATGL modifies the cardiac lipid...,"However, the impact of enhanced adipose triacy..."
2,PMC5779697,Adipose tissue ATGL modifies the cardiac lipid...,To investigate the role of adipose tissue lipo...
3,PMC5779697,Adipose tissue ATGL modifies the cardiac lipid...,atATGL-KO mice were subjected to transverse ao...
4,PMC5779697,Adipose tissue ATGL modifies the cardiac lipid...,The cardiac mouse lipidome and the human plasm...
5,PMC5779697,Adipose tissue ATGL modifies the cardiac lipid...,TAC-induced increases in left ventricular mass...
6,PMC5779697,Adipose tissue ATGL modifies the cardiac lipid...,"More importantly, atATGL-KO mice were protecte..."
7,PMC5779697,Adipose tissue ATGL modifies the cardiac lipid...,Perturbation of lipolysis in the adipose tissu...
8,PMC5779697,Adipose tissue ATGL modifies the cardiac lipid...,Profound changes occurred in the lipid class o...
9,PMC5779697,Adipose tissue ATGL modifies the cardiac lipid...,"Moreover, selected heart failure-induced PE sp..."


In [None]:
#save file to csv
sent_df.to_csv("BioHackathon_Lipids.csv", index=False)

In [20]:
def process_in_paragraph(row):
    """
    Processes an article's sections, paragraphs, and figure captions.

    Args:
        row (dict): The row containing article metadata.

    Returns:
        list: A list of tuples containing PMCID, section title, and paragraph text.
    """
    pmcid = row["PMCID"]
    soup = get_xml(pmcid)

    if not soup:
        return []  # Return empty list if no XML content

    try:
        soup = filter_tags(soup)
        title = soup.find("article-title").text if soup.find("article-title") else "Article is missing a title"
        title = clean_text(title)
        rows = []

        # Process abstract if available
        abstract = soup.find("abstract")
        if abstract:
            # Extract all text from <p> tags within <abstract>, ignoring subsections
            abstract_text = " ".join([clean_text(p.get_text(separator=" ").strip()) for p in abstract.find_all("p")])
            rows.append((pmcid, title, "Abstract", abstract_text))

        # Find the main body, excluding the abstract
        body = soup.find("body")

        # Ensure body is found before proceeding
        if body:
            # Process sections and paragraphs in the body only
            for section in body.find_all("sec", recursive=True):
                section_title = section.find("title").get_text(strip=True) if section.find("title") else "Unnamed section"

                # Extract paragraphs
                for paragraph in section.find_all("p", recursive=True):
                    paragraph_text = clean_text(paragraph.get_text(separator=" ").strip())
                    if paragraph_text:
                        rows.append((pmcid, title, section_title, paragraph_text))

                # Handle figure captions
                for figure in section.find_all("fig", recursive=True):
                    caption = figure.find("caption")
                    if caption:
                        caption_text = clean_text(caption.get_text(separator=" ").strip())
                        figure_title = f"{section_title} - Figure Caption"
                        rows.append((pmcid, title, figure_title, caption_text))

        return rows

    except Exception as e:
        print(f"Error processing XML for PMCID {pmcid}: {e}")
        return []


In [21]:
#generate paragraph data
paragraph_df = parallel_process_articles(pmcid_df,process_in_paragraph, processing_type='par')
print(paragraph_df.shape)
paragraph_df.head()

Extracting and processing Articles: 100%|██████████| 48/48 [01:14<00:00,  1.55s/it]

(5147, 4)





Unnamed: 0,PMCID,Title,Section,Paragraph_text
0,PMC5779697,Adipose tissue ATGL modifies the cardiac lipid...,Abstract,Adipose tissue lipolysis occurs during the dev...
1,PMC5779697,Adipose tissue ATGL modifies the cardiac lipid...,Introduction,The development of chronic systolic heart fail...
2,PMC5779697,Adipose tissue ATGL modifies the cardiac lipid...,Introduction,Triacylglycerol (TAG) hydrolysis in adipose ti...
3,PMC5779697,Adipose tissue ATGL modifies the cardiac lipid...,Introduction,Mice carrying a constitutive deletion of ATGL ...
4,PMC5779697,Adipose tissue ATGL modifies the cardiac lipid...,Introduction,"In this study, we report that perturbation of ..."
