In [None]:
import re
import pandas as pd

def extract_info(entry):
    """Extract the PMID, title and abstract from one MEDLINE-format record.

    Fields are located by their tag at the start of a line (e.g. ``TI  - ``,
    ``AB  - ``). A field's value runs over every following line that begins
    with whitespace, which is how the format marks wrapped text, so the value
    ends at the next tag line regardless of which tag that is.

    Args:
        entry: Text of a single record.

    Returns:
        A ``(pmid, title, abstract)`` tuple of strings. A field that is not
        present is returned as ``""``. Line wraps and indentation inside a
        field are collapsed to single spaces.
    """

    def _field(tag):
        # Tag at line start, optional space padding, "- ", then the first line
        # of the value plus any indented continuation lines.
        pattern = r'^{} *- (.*(?:\n[ \t]+.*)*)'.format(re.escape(tag))
        found = re.search(pattern, entry, re.MULTILINE)
        if not found:
            return ""
        # split()/join removes the newline and the continuation indentation.
        return ' '.join(found.group(1).split())

    # Extract PMID
    pmid_match = re.search(r'PMID- (\d+)', entry)
    pmid = pmid_match.group(1) if pmid_match else ""

    # Extract Title and Abstract
    title = _field('TI')
    abstract = _field('AB')

    return pmid, title, abstract

# Read the data from the text file. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
# NOTE(review): UTF-8 is assumed for the PubMed export -- confirm.
with open('pubmed-piRNA-set.txt', 'r', encoding='utf-8') as file:
    data = file.read()

# Split the data into individual entries on blank lines. The separator may
# contain stray whitespace or span several blank lines, so a regex split is
# used instead of a literal '\n\n'.
entries = re.split(r'\n\s*\n', data)

# Extract the information for each entry that actually carries a PMID
extracted_data = [extract_info(entry) for entry in entries if 'PMID-' in entry]

# Convert to DataFrame
df = pd.DataFrame(extracted_data, columns=['PMID', 'Title', 'Abstract'])

# Print the DataFrame to the console (pandas truncates long frames itself)
print(df)

# Save the DataFrame to a CSV file; a failure is reported rather than raised
# so the rest of the notebook can still be inspected.
output_filename = 'extracted_data.csv'
try:
    df.to_csv(output_filename, index=False)
    print(f"Data extracted and saved to {output_filename}")
except Exception as e:
    print(f"An error occurred while saving the file: {e}")


          PMID                                              Title  \
0     31399034  The emerging role of the piRNA/piwi complex in...   
1     37758951  piRNA processing by a trimeric Schlafen-domain...   
2     35182819         Structural insights into piRNA biogenesis.   
3     37407865  PiRNA in Cardiovascular Disease: Focus on Card...   
4     28964526       piRNA Biogenesis in Drosophila melanogaster.   
...        ...                                                ...   
2942  37422840  High interindividual variability in LDL-choles...   
2943  38939389  Rivaroxaban vs Vitamin K Antagonist in Patient...   
2944  37062040  A worldwide survey on incidence, management, a...   
2945  37068915  Characteristics and outcomes of SARS-CoV-2 bre...   
2946  38194227  Comparator Data Characteristics and Testing Pr...   

                                               Abstract  
0     Piwi interacting RNAs (piRNAs) constitute nove...  
1     Transposable elements are genomic parasites th...

In [None]:
# Install the scispaCy "en_core_sci_md" model, release 0.5.0, from its hosted tarball.
# NOTE(review): this cell's install log shows the model requiring spacy<3.3.0,
# while the later scispacy install resolves spacy>=3.7.0 and spaCy is then
# upgraded -- the pins look incompatible; confirm which model release is intended.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_md-0.5.0.tar.gz


Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_md-0.5.0.tar.gz
  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_md-0.5.0.tar.gz (120.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.2/120.2 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting spacy<3.3.0,>=3.2.3 (from en_core_sci_md==0.5.0)
  Downloading spacy-3.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (23 kB)
Collecting thinc<8.1.0,>=8.0.12 (from spacy<3.3.0,>=3.2.3->en_core_sci_md==0.5.0)
  Downloading thinc-8.0.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting wasabi<1.1.0,>=0.8.1 (from spacy<3.3.0,>=3.2.3->en_core_sci_md==0.5.0)
  Downloading wasabi-0.10.1-py3-none-any.whl.metadata (28 kB)
Collecting typer<0.5.0,>=0.3.0 (from spacy<3.3.0,>=3.2.3->en_core_sci_md==0.5.0)
  Downloading typer-0

In [None]:
# Install the scispacy package. The version is not pinned, so the release
# that gets resolved can change between runs.
!pip install scispacy

Collecting scispacy
  Downloading scispacy-0.5.4-py3-none-any.whl.metadata (16 kB)
Collecting spacy<3.8.0,>=3.7.0 (from scispacy)
  Using cached spacy-3.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting scipy<1.11 (from scispacy)
  Downloading scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.9/58.9 kB[0m [31m844.3 kB/s[0m eta [36m0:00:00[0m
Collecting conllu (from scispacy)
  Downloading conllu-5.0.1-py3-none-any.whl.metadata (21 kB)
Collecting nmslib>=1.7.3.6 (from scispacy)
  Downloading nmslib-2.1.1.tar.gz (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pysbd (from scispacy)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting pybind11<2.6.2 (from nmslib>=1.7.3.6-

In [None]:
# Upgrade spaCy to the newest available release, then download the small
# English pipeline. The download output asks for a kernel/runtime restart
# before the freshly installed packages are loaded.
!pip install -U spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
# Show which spaCy release this kernel imported
print(spacy.__version__)

# Smoke-test the small English pipeline: report the outcome either way
# instead of letting a load failure stop the notebook.
try:
    nlp = spacy.load("en_core_web_sm")
except Exception as load_error:
    print(f"An error occurred: {load_error}")
else:
    print("Model loaded successfully!")

3.7.6
Model loaded successfully!


In [None]:
import re
import pandas as pd

# Regular expression for piRNA mentions. annotate_text applies it with
# re.IGNORECASE. It covers numbered molecules ("piRNA-123", "piR-823"), the
# spelled-out name written with a hyphen or with whitespace ("PIWI-interacting
# RNA", "Piwi interacting RNAs"), and the bare singular/plural abbreviation.
piRNA_pattern = r'\b(?:piRNA-?\d+|piR-?\d+|PIWI[-\s]+interacting\s+RNAs?|piRNAs?|pirnas?)\b'

# Extended list of common diseases and related terms
disease_terms = [
    "cancer", "disease", "cardiovascular diseases", "CVD", "heart disease",
    "diabetes", "hypertension", "stroke", "Alzheimer's", "Parkinson's",
    "renal cell carcinoma", "RCC", "chronic obstructive pulmonary disease", "COPD",
    "breast cancer", "lung cancer", "colorectal cancer", "leukemia", "lymphoma",
    "melanoma", "prostate cancer", "liver cancer", "pancreatic cancer",
    "ovarian cancer", "brain cancer", "endometrial cancer", "kidney disease",
    "liver disease", "lupus", "multiple sclerosis", "ALS", "amyotrophic lateral sclerosis",
    "asthma", "epilepsy", "depression", "schizophrenia", "bipolar disorder",
    "HIV", "AIDS", "tuberculosis", "pneumonia", "COVID-19", "Zika virus",
    "Ebola", "dengue", "malaria", "hepatitis", "cirrhosis", "fibrosis"
]


def _disease_regex(terms):
    """Build one alternation regex for all terms, or None when there are none."""
    # Drop duplicates and blank entries, keeping the first occurrence of each.
    unique_terms = [term for term in dict.fromkeys(terms) if term.split()]
    if not unique_terms:
        return None
    # Longest first, so a term that extends another one is tried before it.
    unique_terms.sort(key=len, reverse=True)
    # Words of a multi-word term may be separated by any run of whitespace
    # (abstracts can carry line-wrap indentation between words).
    alternatives = [
        r'\s+'.join(re.escape(word) for word in term.split())
        for term in unique_terms
    ]
    return r'\b(?:' + '|'.join(alternatives) + r')\b'


def _tag_mentions(text, term_regex, tag):
    """Wrap every match of term_regex in <tag>...</tag>, leaving markup alone."""
    # The "tagged" alternative consumes spans that must not be rescanned:
    # text already wrapped in this tag, and the tag tokens themselves, so the
    # word "disease" inside "</disease>" is never mistaken for a mention.
    protected = r'(?P<tagged><{0}>.*?</{0}>|</?(?:piRNA|disease)>)'.format(tag)
    combined = protected + r'|(?:' + term_regex + r')'

    def _replace(match):
        if match.group('tagged') is not None:
            return match.group(0)
        return '<{0}>{1}</{0}>'.format(tag, match.group(0))

    return re.sub(combined, _replace, text, flags=re.IGNORECASE | re.DOTALL)


# Function to annotate text with <piRNA> and <disease> tags
def annotate_text(text):
    """Annotate piRNA and disease mentions in a piece of text.

    Each kind of mention is tagged in a single pass, so a specific term such
    as "breast cancer" is wrapped as a whole instead of having its generic
    part ("cancer") tagged first. Text that is already tagged is left as it
    is, which makes the function safe to apply to its own output.

    Args:
        text: The text to annotate. Anything that is not a string (NaN, None,
            and so on) is returned unchanged.

    Returns:
        The text with mentions wrapped in <piRNA>...</piRNA> and
        <disease>...</disease>, matched case-insensitively.
    """
    if not isinstance(text, str):
        return text

    # Step 1: Annotate piRNA molecules (including plural forms)
    text = _tag_mentions(text, piRNA_pattern, 'piRNA')

    # Step 2: Annotate diseases; the term list is read at call time so edits
    # to disease_terms take effect without redefining this function.
    disease_regex = _disease_regex(disease_terms)
    if disease_regex is not None:
        text = _tag_mentions(text, disease_regex, 'disease')

    return text

# Function to process the dataset and apply annotations
def process_dataset(input_filepath, output_filepath):
    """Annotate the abstracts of a CSV dataset and write the result to a new CSV.

    Reads ``input_filepath``, adds an ``Annotated_Abstract`` column produced by
    applying ``annotate_text`` to the ``Abstract`` column, writes the frame to
    ``output_filepath`` without the index, and prints a short preview of the
    first rows.

    Args:
        input_filepath: CSV file to read; the code reads its ``PMID`` and
            ``Abstract`` columns.
        output_filepath: Destination CSV file.
    """
    # Load the dataset
    data = pd.read_csv(input_filepath)

    # Create a new column for annotated abstracts
    data['Annotated_Abstract'] = data['Abstract'].apply(annotate_text)

    # Save the results to a new CSV file
    data.to_csv(output_filepath, index=False)
    print(f"Annotated data saved to {output_filepath}")

    # Print a sample of processed abstracts
    print("\nSample of processed abstracts:")
    for _, row in data.head().iterrows():
        annotated = row['Annotated_Abstract']
        # An empty Abstract cell is read back from CSV as NaN (a float), which
        # cannot be sliced; show an empty preview for such rows.
        preview = annotated[:500] if isinstance(annotated, str) else ""
        print(f"PMID: {row['PMID']}")
        print(f"Annotated Abstract: {preview}...")  # Print first 500 characters
        print("-" * 50)

# Run the annotation step: read the extracted records, write the annotated copy
input_file, output_file = 'extracted_data.csv', 'annotated_data.csv'
process_dataset(input_filepath=input_file, output_filepath=output_file)


Annotated data saved to annotated_data.csv

Sample of processed abstracts:
PMID: 31399034
Annotated Abstract: Piwi interacting RNAs (<piRNA>piRNAs</piRNA>) constitute novel small non-coding RNA molecules of        approximately 24-31 nucleotides in length that often bind to members of the piwi        protein family to play regulatory roles. Recently, emerging evidence suggests        that in addition to the mammalian germline, <piRNA>piRNAs</piRNA> are also expressed in a        tissue-specific manner in a variety of human tissues and modulate key signaling        pathways at the transcriptional or post...
--------------------------------------------------
PMID: 37758951
Annotated Abstract: Transposable elements are genomic parasites that expand within and spread between        genomes(1). PIWI proteins control transposon activity, notably in the        germline(2,3). These proteins recognize their targets through small RNA        co-factors named <piRNA>PIWI-interacting RNAs</piRNA> (