This Jupyter Notebook belongs to the ETL collection of notebooks from D3N. Its purpose is to extract abstracts from a raw, unstructured .txt file and split them into groups of sentences.

# Imports

In [1]:
import logging
import sys

import pandas as pd
import numpy as np
import random
import re

import nltk
nltk.download('punkt')

import spacy
from tqdm import tqdm
import os

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Setting-Up Pipeline

In [3]:
def setup_logging(log_level=logging.INFO, log_file='app.log'):
    """Configure the root logger to write to stdout and to a log file.

    The function is safe to call repeatedly (for example when the notebook
    cell is re-run): handlers installed by an earlier call are removed and
    closed before new ones are attached, so each record is emitted once per
    destination and no file handle is leaked. Handlers attached to the root
    logger by other code are left untouched.

    Args:
        log_level (int): Logging level (default is logging.INFO).
        log_file (str): Log file name (default is 'app.log').
    """
    logger = logging.getLogger()
    logger.setLevel(log_level)

    # Drop only the handlers this function created on a previous call.
    for stale_handler in list(logger.handlers):
        if getattr(stale_handler, '_installed_by_setup_logging', False):
            logger.removeHandler(stale_handler)
            stale_handler.close()

    # Console and file output share the same record layout.
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    new_handlers = (logging.StreamHandler(sys.stdout), logging.FileHandler(log_file))
    for handler in new_handlers:
        handler.setLevel(log_level)
        handler.setFormatter(formatter)
        # Marker used by the clean-up loop above on the next call.
        handler._installed_by_setup_logging = True
        logger.addHandler(handler)

def log_variable(name, value):
    """Write an INFO-level record of the form ``<name>: <value>``.

    The record is emitted through the module-level ``logging.info`` call,
    i.e. via the root logger.

    Args:
        name (str): Name of the variable.
        value (any): Value of the variable.
    """
    message = f'{name}: {value}'
    logging.info(message)

# Configure the root logger with the defaults: INFO level, stdout plus 'app.log'.
setup_logging()

# First record of the run; confirms that logging is wired up.
logging.info("Program started")

2025-02-22 16:05:08,893 - root - INFO - Program started


### Initializing Spacy Language Model

In [None]:
# Load the spaCy pipeline; `segmenter` uses the resulting `nlp` object to split
# abstracts into sentences.
try:
    nlp = spacy.load("en_core_sci_sm")
    print("Model loaded successfully")
except Exception as e:
    print(f"Error loading the model: {e}")
    # Re-raise instead of continuing: without `nlp` every later cell that
    # segments text would fail with a confusing NameError far from the cause.
    raise

import torch

# Report GPU availability and select the first CUDA device for torch.
# NOTE(review): this only sets torch's current device; whether the spaCy
# pipeline loaded above actually runs on the GPU is not established here
# (spacy.prefer_gpu() may have been intended) - confirm.
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"Num of GPUs available: {num_gpus}")
    torch.cuda.set_device(0)
else:
    print("Cuda not available")

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


Model loaded successfully
Num of GPUs available: 1


## Helper Functions

In [5]:
def extract_pubmed_data(file_path, encoding='utf-8'):
    """
    Parse a PubMed-format .txt export into a list of dictionaries, one per
    article. (Suitable for Pubmed Abstract Extracts)

    A new article starts at every 'PMID- ' line. The fields collected are
    'PMID', 'Title' (TI), 'Abstract' (AB), 'Date of Publication' (DP) and
    'MeSH' (MH, a list of strings). Continuation lines (lines indented with
    six spaces) are appended, separated by a single space, to the Title,
    Abstract or MeSH entry they follow. Any other tagged line (e.g. AD, CI,
    FAU) ends the current field, so its own wrapped lines are never merged
    into the title or abstract.

    param: file_path: String
    param: encoding: String, text encoding used to read the file
           (default 'utf-8', independent of the platform default)
    returns: extracted_data: List[Dict[str,str]] (the 'MeSH' value is a list)
    """
    # A tagged line carries a 2-4 letter tag padded to four columns, then '-'.
    tag_line = re.compile(r'^(?:[A-Z]{4}|[A-Z]{3} |[A-Z]{2}  )-(?: |$)')

    def field_value(line, prefix):
        # Slice the prefix off instead of splitting on it, so a value that
        # happens to contain the prefix text is not truncated.
        return line[len(prefix):].strip()

    extracted_data = []
    article = {}
    current_field = None
    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            line = line.rstrip()
            if line.startswith('PMID- '):
                # A PMID line opens a new record; flush the previous one.
                if article:
                    extracted_data.append(article)
                article = {'PMID': field_value(line, 'PMID- ')}
                current_field = None
            elif line.startswith('TI  - '):
                article['Title'] = field_value(line, 'TI  - ')
                current_field = 'Title'
            elif line.startswith('AB  - '):
                article['Abstract'] = field_value(line, 'AB  - ')
                current_field = 'Abstract'
            elif line.startswith('DP  - '):
                article['Date of Publication'] = field_value(line, 'DP  - ')
                current_field = None
            elif line.startswith('MH  - '):
                article.setdefault('MeSH', []).append(field_value(line, 'MH  - '))
                current_field = 'MeSH'
            elif line.startswith('      '):
                # Continuation of whichever collected field came last; wrapped
                # lines of fields we do not collect are ignored.
                continuation = line.strip()
                if current_field in ('Title', 'Abstract'):
                    article[current_field] += ' ' + continuation
                elif current_field == 'MeSH':
                    article['MeSH'][-1] += ' ' + continuation
            elif tag_line.match(line):
                # Some other field starts here: stop extending the previous one.
                current_field = None
    # Flush the last record of the file.
    if article:
        extracted_data.append(article)
    return extracted_data

def segmenter(text, pmid, i=0):
    """Split one abstract into sentences with the module-level ``nlp`` pipeline.

    params: text (str): An abstract.
            pmid (int): The id of the abstract.
            i (int) : Number assigned to the first sentence (default 0);
                      following sentences are numbered consecutively.
    Returns: sentences (list): The stripped sentence strings, in order.
             data (list): One dict per sentence with the keys
                          {'pmid', 'sent_no', 'sentence'}.
    """
    parsed = nlp(text)
    sentences = [span.text.strip() for span in parsed.sents]
    data = [
        {'pmid': pmid, 'sent_no': i + offset, 'sentence': sentence}
        for offset, sentence in enumerate(sentences)
    ]
    return sentences, data

def segment_sentences(samples: pd.DataFrame, output_path: str):
    """Segment every abstract into sentences and write the results as CSV.

    Rows without an 'Abstract' are dropped, each remaining abstract is passed
    to ``segmenter`` together with its 'PMID', and two files are written into
    ``output_path``:

    * ``unsegmented_unfiltered.csv`` - the abstract-level rows plus a
      'sentences_list' column holding each abstract's list of sentences.
    * ``segmented_unfiltered.csv`` - one row per sentence ('pmid', 'sent_no',
      'sentence') preceded by an 'index' column.

    The input frame is not modified; all work happens on a copy.

    Args:
        samples (pd.DataFrame): Frame with at least 'PMID' and 'Abstract' columns.
        output_path (str): Directory the CSV files are written to. A trailing
            path separator is optional.

    Returns:
        pd.DataFrame: The sentence-level frame that was written to
        ``segmented_unfiltered.csv``.
    """
    # dropna + reset_index both return new objects, so the caller's frame is
    # left untouched and no in-place operation is applied to a derived frame.
    abstracts_df = samples.dropna(subset=['Abstract']).reset_index(drop=True)

    sentence_rows = []
    sentences_per_abstract = []
    for _, row in tqdm(abstracts_df.iterrows(), total=len(abstracts_df), desc="Segmenting Sentences"):
        segmented_sentences, data = segmenter(text=row['Abstract'], pmid=row["PMID"])
        sentence_rows.extend(data)
        sentences_per_abstract.append(segmented_sentences)

    # Wrap in an object Series so each cell holds one Python list.
    abstracts_df['sentences_list'] = pd.Series(
        sentences_per_abstract, index=abstracts_df.index, dtype=object
    )

    # os.path.join works whether or not output_path ends with a separator.
    abstracts_df.to_csv(os.path.join(output_path, "unsegmented_unfiltered.csv"), index=False)

    # reset_index() materialises the running row number as an 'index' column.
    subset_df = pd.DataFrame(sentence_rows).reset_index()
    subset_df.to_csv(os.path.join(output_path, "segmented_unfiltered.csv"), index=False)
    return subset_df

# Implementation

In [6]:
# Create List of Dictionaries
# NOTE(review): these are absolute, machine-specific paths; they must be
# adjusted before the notebook can run on another machine.
file_path = "D:/CSE498R_Resources/D3N/Dengue-Drug-Discovery-Network-D3N/Data/input/pubmed-DENV-10k-abstracts.txt"
output_path = "D:/CSE498R_Resources/D3N/Dengue-Drug-Discovery-Network-D3N/Data/output/unfiltered/"

# exist_ok=True makes the call a no-op when the directory is already there,
# without a separate (race-prone) existence check.
os.makedirs(output_path, exist_ok=True)
data = extract_pubmed_data(file_path)

# Convert List of dicts to a dataframe
df = pd.DataFrame(data)
print(f"Shape of Dataframe: {df.shape}")
df.head()

Shape of Dataframe: (10000, 5)


Unnamed: 0,PMID,Date of Publication,Title,Abstract,MeSH
0,19822889,2009 Oct,Dengue virus pathogenesis: an integrated view.,Much remains to be learned about the pathogene...,"[Dengue/pathology/*virology, Dengue Virus/path..."
1,30301880,2018 Oct 10,Dengue virus and the host innate immune response.,Dengue virus (DENV) is a mosquito-borne Flaviv...,"[Animals, Antibodies, Viral/immunology, Dengue..."
2,34696397,2021 Sep 30,Dengue Virus Infection: A Tale of Viral Exploi...,Dengue is a mosquito-borne viral disease (arbo...,"[Antibodies, Viral/immunology, Dengue/*metabol..."
3,32751561,2020 Jul 30,Dengue: A Minireview.,"Dengue, caused by infection of any of four den...","[Aedes/virology, Animals, *Dengue/epidemiology..."
4,27213782,2016,Meta-Analysis of Dengue Severity during Infect...,INTRODUCTION: Dengue virus (DENV) infection is...,"[Dengue/*virology, Dengue Virus/*classificatio..."


In [None]:
# Use the full dataframe as the working sample: no row subsetting is applied
# here, and `sample` refers to the same object as `df` (it is not a copy).
sample = df
print(f"Shape of Dataframe: {sample.shape}")
# Pick one row at a uniformly random position for a quick sanity check.
random_row = sample.iloc[random.randint(0, len(sample) - 1)]
print(f"Random Row: \n{random_row}")

Shape of Dataframe: (10000, 5)
Random Row: PMID                                                            18705473
Date of Publication                                             2007 Dec
Title                  Predictive value of thrombocytopaenia in the d...
Abstract               Thrombocytopaenia is often relied upon as an i...
MeSH                   [Adult, Blood Pressure Monitoring, Ambulatory,...
Name: 6482, dtype: object


In [8]:
# Preview the first rows of the working sample.
sample.head()

Unnamed: 0,PMID,Date of Publication,Title,Abstract,MeSH
0,19822889,2009 Oct,Dengue virus pathogenesis: an integrated view.,Much remains to be learned about the pathogene...,"[Dengue/pathology/*virology, Dengue Virus/path..."
1,30301880,2018 Oct 10,Dengue virus and the host innate immune response.,Dengue virus (DENV) is a mosquito-borne Flaviv...,"[Animals, Antibodies, Viral/immunology, Dengue..."
2,34696397,2021 Sep 30,Dengue Virus Infection: A Tale of Viral Exploi...,Dengue is a mosquito-borne viral disease (arbo...,"[Antibodies, Viral/immunology, Dengue/*metabol..."
3,32751561,2020 Jul 30,Dengue: A Minireview.,"Dengue, caused by infection of any of four den...","[Aedes/virology, Animals, *Dengue/epidemiology..."
4,27213782,2016,Meta-Analysis of Dengue Severity during Infect...,INTRODUCTION: Dengue virus (DENV) infection is...,"[Dengue/*virology, Dengue Virus/*classificatio..."


In [None]:
# Segment every abstract into sentences; this writes the abstract-level and
# sentence-level CSV files into `output_path` and returns the sentence-level frame.
subset_df = segment_sentences(samples=sample,output_path=output_path)

Segmenting Sentences:  88%|████████▊ | 8426/9559 [05:51<00:50, 22.55it/s]