In [None]:
with open("pubmed_ids.txt", "r") as file:
    pubmed_ids = file.read().splitlines()
    print(f"Retrieved {len(pubmed_ids)} PubMed IDs")

In [None]:
import requests
def SubmitPMIDList(pmids, Format,batch_size):
    # Ensure pmids is a list of strings
    if not isinstance(pmids, list):
        print("[Error]: 'pmids' must be a list of PubMed IDs.")
        return

    # Prepare the JSON payload
    json_payload = {"pmids": pmids}
    # Set headers
    headers = {"Content-Type": "application/json"}

    # Make the POST request
    url = "https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/" + Format
    r = requests.post(url, json=json_payload, headers=headers)
    # Check the response status code
    if r.status_code != 200:
        print("[Error]: HTTP code", r.status_code)
        print("Response:", r.text)  # Print the response body for more details
    else:
        # Save the response to a file
        if(batch_size==10000):
            with open('.\data\steps data output\step-2\1_pubtator_data\pubtator_data_1.json', 'a', encoding="UTF-8") as fileread:
                fileread.write(r.text)  # No need to encode/decode, as r.text is already a string
        elif (batch_size==20000):
            with open('.\data\steps data output\step-2\1_pubtator_data\pubtator_data_2.json', 'a', encoding="UTF-8") as fileread:
                fileread.write(r.text)  # No need to encode/decode, as r.text is already a string
        elif (batch_size==30000):
            with open('.\data\steps data output\step-2\1_pubtator_data\pubtator_data_3.json', 'a', encoding="UTF-8") as fileread:
                fileread.write(r.text)  # No need to encode/decode, as r.text is already a string
        elif (batch_size==40000):
            with open('.\data\steps data output\step-2\1_pubtator_data\pubtator_data_4.json', 'a', encoding="UTF-8") as fileread:
                fileread.write(r.text)  # No need to encode/decode, as r.text is already a string
        elif (batch_size==50000):
            with open('.\data\steps data output\step-2\1_pubtator_data\pubtator_data_5.json', 'a', encoding="UTF-8") as fileread:
                fileread.write(r.text)  # No need to encode/decode, as r.text is already a string
        else:
            with open('.\data\steps data output\step-2\1_pubtator_data\pubtator_data_6.json', 'a', encoding="UTF-8") as fileread:
                fileread.write(r.text)  # No need to encode/decode, as r.text is already a string


In [None]:
def pubtator(pubmed_ids, batch_size=10000):
    import time
    pubtator_raw=[]
    total_ids = len(pubmed_ids)
    # Process IDs in batches
    for start in range(0, total_ids, batch_size):
        end = min(start + batch_size, total_ids)
        batch_ids = pubmed_ids[start:end]
        print(f"Fetching details for IDs {start + 1} to {end} of {total_ids}...")
        try:
            # Fetch records in XML format
            for i in range(start, end, 1000):
                SubmitPMIDList(pubmed_ids[i:i+1000],'biocjson',end)
        except Exception as e:
            print(f"Error fetching details for batch {start + 1}-{end}: {e}")
        
        # Respect NCBI's rate limit (3 requests per second)
        time.sleep(1 / 3)  # Add a small delay between batches

In [None]:
pubtator(pubmed_ids)

In [None]:
import ijson

# List of six PubTator JSON files
file_paths = [
    '.\data\steps data output\step-2\1_pubtator_data\pubtator_data_1.json',
    '.\data\steps data output\step-2\1_pubtator_data\pubtator_data_2.json',
    '.\data\steps data output\step-2\1_pubtator_data\pubtator_data_3.json',
    '.\data\steps data output\step-2\1_pubtator_data\pubtator_data_4.json',
    '.\data\steps data output\step-2\1_pubtator_data\pubtator_data_5.json',
    '.\data\steps data output\step-2\1_pubtator_data\pubtator_data_6.json'
]

# List to store filtered JSON objects (Homo sapiens only)
filtered_pubtator_json = []

# Process each file incrementally
for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            # Use 'item' if each file is a JSON array of objects
            parser = ijson.items(file, 'item')
            for obj in parser:
                passages = obj.get('passages', [])
                
                # Ensure there is a second passage with annotations
                if len(passages) > 1 and passages[1].get('annotations'):
                    for annotation in passages[1]['annotations']:
                        infons = annotation.get('infons', {})
                        
                        # Filter for Homo sapiens only
                        if infons.get('identifier') == '9606':
                            filtered_pubtator_json.append(obj)
                            break  # Stop after first match
        except ijson.JSONError as e:
            print(f"Error in file {file_path}: {e}")
    print(f"Processed {file_path}, total filtered objects so far: {len(filtered_pubtator_json)}")

print(f"\nFinal number of Homo sapiens entries: {len(filtered_pubtator_json)}")

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
import re
import json

# Make sure NLTK punkt tokenizer is downloaded
nltk.download('punkt')

def process_article(article):
    """
    Process a single PubTator JSON article: split abstract into sentences and get positions.
    """
    info = article.get('passages')[1]  # Assuming 2nd passage has abstract
    abstract = info.get('text')
    sentences = sent_tokenize(abstract, language="english")
    positions = []
    start = 0

    for sentence in sentences:
        # Escape special regex characters
        escaped_sentence = sentence.translate(str.maketrans({
            "-": r"\-","\\": r"\\","^": r"\^","$": r"\$","*": r"\*",".": r"\.",
            "(": r"\("," )": r"\)", "+": r"\+", "[": r"\[", "]": r"\]", "{": r"\{",
            "}": r"\}","|": r"\|","?": r"\?"
        }))
        match = re.search(escaped_sentence, abstract, flags=re.M | re.I)
        if match:
            positions.append((start + match.start(), start + match.end()))
            abstract = abstract[match.end():]  # Update abstract to avoid overlapping matches
            start += match.end()

    return {
        'PMID': article.get('id'),
        'AB': info.get('text'),
        'SENTENCE': sentences,
        'positions': positions,
        'annotations': info.get('annotations'),
        'offset': info.get('offset')
    }


In [None]:
filtered_pubtator_json_final = []

for article in filtered_pubtator_json:
    processed = process_article(article)
    filtered_pubtator_json_final.append(processed)


In [None]:
len(filtered_pubtator_json_final)

In [None]:
import os

output_dir = '.\data\steps data output\step-2\2_cleaned_pubtator_data'
os.makedirs(output_dir, exist_ok=True)

for idx, file_path in enumerate(file_paths, start=1):
    filtered_articles = []

    # Load & filter Homo sapiens entries
    with open(file_path, 'r', encoding='utf-8') as f:
        parser = ijson.items(f, 'item')
        for obj in parser:
            passages = obj.get('passages', [])
            if len(passages) > 1 and passages[1].get('annotations'):
                for annotation in passages[1]['annotations']:
                    infons = annotation.get('infons', {})
                    if infons.get('identifier') == '9606':
                        filtered_articles.append(obj)
                        break

    # Process sentences & positions
    cleaned_articles = [process_article(article) for article in filtered_articles]

    # Save to JSON
    output_file = os.path.join(output_dir, f'pubtator_data_{idx}_cleaned.json')
    with open(output_file, 'w', encoding='utf-8') as out_f:
        json.dump(cleaned_articles, out_f, indent=4)

    print(f"Processed and saved {len(cleaned_articles)} articles to {output_file}")


In [None]:
import ijson
file_paths = ['.\data\steps data output\step-2\2_cleaned_pubtator_data\pubtator_data_1_cleaned.json',
              '.\data\steps data output\step-2\2_cleaned_pubtator_data\pubtator_data_2_cleaned.json',
              '.\data\steps data output\step-2\2_cleaned_pubtator_data\pubtator_data_3_cleaned.json',
              '.\data\steps data output\step-2\2_cleaned_pubtator_data\pubtator_data_4_cleaned.json',
              '.\data\steps data output\step-2\2_cleaned_pubtator_data\pubtator_data_5_cleaned.json',
              '.\data\steps data output\step-2\2_cleaned_pubtator_data\pubtator_data_6_cleaned.json']
# Initialize an empty list to store JSON objects
all_articles_details = []
# Process the file incrementally
for i in file_paths:
    with open(i, 'r', encoding='utf-8') as file:
        try:
            # Use the correct prefix for your JSON structure
            # For a JSON array, use an empty string ('') or 'item'
            parser = ijson.items(file, 'item')  # Adjust the prefix if needed
            for item in parser:
                all_articles_details.append(item)  # Add each JSON object to the list
        except ijson.JSONError as e:
            print(item)
            print(f"Skipping error: {e}")
    print(len(all_articles_details))

4270
9789
16277
22872
27932
28686


In [None]:
articles_list=[]
error=[]
error2=[]
for i in all_articles_details:
    for j in i.get('annotations'):
        for k,l in enumerate(i.get('positions')):
            if j.get('locations')[0].get('offset')>=l[0]+i.get('offset') and j.get('locations')[0].get('offset')<l[1]+i.get('offset'):
                articles_list.append({'id':i.get('PMID'),'AB':i.get('AB'), 'type': j.get('infons').get('type'), \
                                     'identifier':j.get('infons').get('identifier'),'text':j.get('text'),'sentence':i.get('SENTENCE')[k],\
                                     'sentenceSize':len(i.get('SENTENCE')),'sentenceIndex':k})
                break
        else:
            error.append(i)
            error2.append(j)

In [6]:
distinct_types = set()
for i in articles_list:
    distinct_types.add(i.get('type'))
print(distinct_types)

{'Species', 'Disease', 'Chromosome', 'Gene', 'Variant', 'Chemical', 'CellLine', 'RefSeq'}


In [7]:
Disease_pubtator_addpmid={}
for i in articles_list:
    if i.get('type')=='Disease':
        if Disease_pubtator_addpmid.get(i.get('id')):
            Disease_pubtator_addpmid[i.get('id')]=Disease_pubtator_addpmid.get(i.get('id'))+[i]
        else:
            Disease_pubtator_addpmid[i.get('id')]=[i]

In [8]:
Disease_output_json=[]
for i,j in Disease_pubtator_addpmid.items():
    Disease_output_json+=j

In [9]:
len(Disease_output_json)

266278

In [10]:
for i in Disease_output_json:
    i['PMID']=i.get('id')
    i['word']=i.get('text')
    del i['id']
    del i['text']

In [11]:
Disease_output_json[0]

{'AB': 'Anterior segment ischemia changes can occur without detachment of any muscles. The most common cause of such ischemic changes of the anterior segment is the removal of too many rectus muscles in one operation. Twenty dog eyes and eight monkey eyes were subjected to the disinsertion and detachment of various combinations of extraocular muscles. They were sacrificed at intervals from 30 to 90 days. During the observation period, they were observed for gross and slit lamp changes. The enucleated eyes were studied microscopically for signs of ischemic and necrotic changes. Two patients who were studied, observed, and treated for anterior segment ischemia following muscle surgery are described. The changes which occur after muscle surgery are extensive and include corneal edema, cataract, chemosis, corneal changes, decreases in intraocular pressure, decreases in outflow or glaucoma and frank necrosis. The variables which lead to this reaction is described in detail. Also, some unans

In [12]:
import re
Disease_deduplicated=set()
Disease_deduplicated_count=0
disease_list_raw=[]
for i in Disease_output_json:
    k=i.get('PMID')
    if k:
        i['AB']=i.get('AB')
        i['URL']='https://pubmed.ncbi.nlm.nih.gov/'+k
        i['target']=i.get('word')
        i['entity']=i.get('word')
        i['type']='Disease'
        if i.get('identifier') and re.match(r'MESH:',i.get('identifier')):
            i['MESH']=i.get('identifier')[5:]
        else:
            i['MESH']=''
        tuples=(i.get("PMID"),i.get("word"),i.get("sentence"))
        if tuples not in Disease_deduplicated:
            Disease_deduplicated.add(tuples)
            disease_list_raw.append(i)
        else:
            Disease_deduplicated_count+=1
            continue

In [13]:
len(disease_list_raw)

249837

In [14]:
disease_list_raw[0]

{'AB': 'Anterior segment ischemia changes can occur without detachment of any muscles. The most common cause of such ischemic changes of the anterior segment is the removal of too many rectus muscles in one operation. Twenty dog eyes and eight monkey eyes were subjected to the disinsertion and detachment of various combinations of extraocular muscles. They were sacrificed at intervals from 30 to 90 days. During the observation period, they were observed for gross and slit lamp changes. The enucleated eyes were studied microscopically for signs of ischemic and necrotic changes. Two patients who were studied, observed, and treated for anterior segment ischemia following muscle surgery are described. The changes which occur after muscle surgery are extensive and include corneal edema, cataract, chemosis, corneal changes, decreases in intraocular pressure, decreases in outflow or glaucoma and frank necrosis. The variables which lead to this reaction is described in detail. Also, some unans

In [None]:
import json
with open('.\data\steps data output\step-2\3_biological_entities\Disease_Extraction.json', 'w') as fileread:
        json.dump(disease_list_raw, fileread, indent=4)

In [41]:
distinct_disease = set()
for i in disease_list_raw:
    if(i.get('sentence')):
        distinct_disease.add(i.get('sentence'))
print(len(distinct_disease))

151723


In [35]:
pubtator_gene=[]
for i in articles_list:
    if i.get('type')=='Gene':
        pubtator_gene.append(i)

In [37]:
for i in pubtator_gene:
    i['PMID']=i.get('id')
    i['word']=i.get('text')
    del i['id']
    del i['text']

In [38]:
import re
Gene_deduplicated=set()
Gene_deduplicated_count=0
Gene_list_raw=[]
for i in pubtator_gene:
    k=i.get('PMID')
    if k:
        i['AB']=i.get('AB')
        i['URL']='https://pubmed.ncbi.nlm.nih.gov/'+k
        i['target']=i.get('word')
        i['entity']=i.get('word')
        i['type']='Gene'
        if i.get('identifier'):
            i['identifier']=i.get('identifier')
        else:
            i['identifier']=''
        tuples=(i.get("PMID"),i.get("identifier"),i.get("word"),i.get("sentence"))
        if tuples not in Disease_deduplicated:
            Gene_deduplicated.add(tuples)
            Gene_list_raw.append(i)
        else:
            Gene_deduplicated_count+=1
            continue

In [45]:
Gene_list_raw[0]

{'AB': 'An RNA recognition motif (RRM) of approximately 80 amino acids constitutes the core of RNA-binding domains found in a large family of proteins involved in RNA processing. The U1 RNA-binding domain of the A protein component of the human U1 small nuclear ribonucleoprotein (RNP), which encompasses the RRM sequence, was analyzed by using NMR spectroscopy. The domain of the A protein is a highly stable monomer in solution consisting of four antiparallel beta-strands and two alpha-helices. The highly conserved RNP1 and RNP2 consensus sequences, containing residues previously suggested to be involved in nucleic acid binding, are juxtaposed in adjacent beta-strands. Conserved aromatic side chains that are critical for RNA binding are clustered on the surface of the molecule adjacent to a variable loop that influences recognition of specific RNA sequences. The secondary structure and topology of the RRM are similar to those of ribosomal proteins L12 and L30, suggesting a distant evolut

In [43]:
distinct_gene = set()
for i in Gene_list_raw:
    if i.get('identifier'):
        distinct_gene.add(i.get('identifier').lower())
print(len(distinct_gene))

2773


In [46]:
distinct_gene = set()
for i in Gene_list_raw:
    if i.get('sentence'):
        distinct_gene.add(i.get('sentence').lower())
print(len(distinct_gene))

9304


In [47]:
distinct_gene = set()
for i in Gene_list_raw:
    if i.get('word'):
        distinct_gene.add(i.get('word').lower())
print(len(distinct_gene))

3781


In [None]:
import json
with open('.\data\steps data output\step-2\3_biological_entities\Gene_Extraction.json', 'w') as fileread:
        json.dump(Gene_list_raw, fileread, indent=4)

In [48]:
pubtator_chromosome=[]
for i in articles_list:
    if i.get('type')=='Chromosome':
        pubtator_chromosome.append(i)

In [50]:
for i in pubtator_chromosome:
    i['PMID']=i.get('id')
    i['word']=i.get('text')
    del i['id']
    del i['text']

In [51]:
import re
Chromosome_deduplicated=set()
Chromosome_deduplicated_count=0
Chromosome_list_raw=[]
for i in pubtator_chromosome:
    k=i.get('PMID')
    if k:
        i['AB']=i.get('AB')
        i['URL']='https://pubmed.ncbi.nlm.nih.gov/'+k
        i['target']=i.get('word')
        i['entity']=i.get('word')
        i['type']='Chromosome'
        if i.get('identifier'):
            i['identifier']=i.get('identifier')
        else:
            i['identifier']=''
        tuples=(i.get("PMID"),i.get("identifier"),i.get("word"),i.get("sentence"))
        if tuples not in Chromosome_deduplicated:
            Chromosome_deduplicated.add(tuples)
            Chromosome_list_raw.append(i)
        else:
            Chromosome_deduplicated_count+=1
            continue

In [None]:
with open('.\data\steps data output\step-2\3_biological_entities\Chromosome_Extraction.json', 'w', encoding="UTF-8") as fileread:
    json.dump(Chromosome_list_raw, fileread, indent=4)  # Convert list to JSON and write to file

In [None]:
for i in pubtator_cellline:
    i['PMID']=i.get('id')
    i['word']=i.get('text')
    del i['id']
    del i['text']

In [None]:
import re
CellLine_deduplicated=set()
CellLine_deduplicated_count=0
CellLine_list_raw=[]
for i in pubtator_cellline:
    k=i.get('PMID')
    if k:
        i['AB']=i.get('AB')
        i['URL']='https://pubmed.ncbi.nlm.nih.gov/'+k
        i['target']=i.get('word')
        i['entity']=i.get('word')
        i['type']='CellLine'
        if i.get('identifier'):
            i['identifier']=i.get('identifier')
        else:
            i['identifier']=''
        tuples=(i.get("PMID"),i.get("identifier"),i.get("word"),i.get("sentence"))
        if tuples not in CellLine_deduplicated:
            CellLine_deduplicated.add(tuples)
            CellLine_list_raw.append(i)
        else:
            CellLine_deduplicated_count+=1
            continue

In [None]:
distinct_cellline = set()
for i in CellLine_list_raw:
    distinct_cellline.add(i.get('identifier'))
print(len(distinct_cellline))

In [None]:
with open('.\data\steps data output\step-2\3_biological_entities\CellLine_Extraction.json', 'w', encoding="UTF-8") as fileread:
    json.dump(CellLine_list_raw, fileread, indent=4)  # Convert list to JSON and write to file

In [55]:
pubtator_variant=[]
for i in articles_list:
    if i.get('type')=='Variant':
        pubtator_variant.append(i)

In [57]:
for i in pubtator_variant:
    i['PMID']=i.get('id')
    i['word']=i.get('text')
    del i['id']
    del i['text']

In [58]:
import re
Variant_deduplicated=set()
Variant_deduplicated_count=0
Variant_list_raw=[]
for i in pubtator_variant:
    k=i.get('PMID')
    if k:
        i['AB']=i.get('AB')
        i['URL']='https://pubmed.ncbi.nlm.nih.gov/'+k
        i['target']=i.get('word')
        i['entity']=i.get('word')
        i['type']='Variant'
        if i.get('identifier'):
            i['identifier']=i.get('identifier')
        else:
            i['identifier']=''
        tuples=(i.get("PMID"),i.get("identifier"),i.get("word"),i.get("sentence"))
        if tuples not in Variant_deduplicated:
            Variant_deduplicated.add(tuples)
            Variant_list_raw.append(i)
        else:
            Variant_deduplicated_count+=1
            continue

In [None]:
with open('.\data\steps data output\step-2\3_biological_entities\Variant_Extraction.json', 'w', encoding="UTF-8") as fileread:
    json.dump(Variant_list_raw, fileread, indent=4)  # Convert list to JSON and write to file

In [None]:
pubtator_RefSeq=[]
for i in articles_list:
    if i.get('type')=='RefSeq':
        pubtator_RefSeq.append(i)

In [None]:
len(pubtator_RefSeq)

In [None]:
for i in pubtator_RefSeq:
    i['PMID']=i.get('id')
    i['word']=i.get('text')
    del i['id']
    del i['text']

In [None]:
import re
RefSeq_deduplicated=set()
RefSeq_deduplicated_count=0
RefSeq_list_raw=[]
for i in pubtator_RefSeq:
    k=i.get('PMID')
    if k:
        i['AB']=i.get('AB')
        i['URL']='https://pubmed.ncbi.nlm.nih.gov/'+k
        i['target']=i.get('word')
        i['entity']=i.get('word')
        i['type']='RefSeq'
        if i.get('identifier'):
            i['identifier']=i.get('identifier')
        else:
            i['identifier']=''
        tuples=(i.get("PMID"),i.get("identifier"),i.get("word"),i.get("sentence"))
        if tuples not in Variant_deduplicated:
            RefSeq_deduplicated.add(tuples)
            RefSeq_list_raw.append(i)
        else:
            RefSeq_deduplicated_count+=1
            continue

In [None]:
distinct_RefSeq = set()
for i in RefSeq_list_raw:
    distinct_RefSeq.add(i.get('identifier'))
print(len(distinct_RefSeq))

In [None]:
with open('.\data\steps data output\step-2\3_biological_entities\RefSeq_Extraction.json', 'w', encoding="UTF-8") as fileread:
    json.dump(RefSeq_list_raw, fileread, indent=4)  # Convert list to JSON and write to file

In [63]:
pubtator_Chemical=[]
for i in articles_list:
    if i.get('type')=='Chemical':
        pubtator_Chemical.append(i)            

In [None]:
for i in pubtator_Chemical:
    identifier = i.get('identifier', '')  # Get the identifier, default to empty string if missing
    parts = identifier.split(':')  # Split by colon
    # Check if the identifier has at least two parts
    if len(parts) > 1:
        chemical = parts[1]  # Get the part after the colon
        i['type_updated'] = category_mesh(chemical)  # Update the type
    else:
        i['type_updated'] = 'Unknown'

In [66]:
for i in pubtator_Chemical:
    i['PMID']=i.get('id')
    i['word']=i.get('text')
    del i['id']
    del i['text']

In [67]:
import re
Chemical_deduplicated=set()
Chemical_deduplicated_count=0
Chemical_list_raw=[]
for i in pubtator_Chemical:
    k=i.get('PMID')
    if k:
        i['AB']=i.get('AB')
        i['URL']='https://pubmed.ncbi.nlm.nih.gov/'+k
        i['target']=i.get('word')
        i['entity']=i.get('word')
        i['type']='Chemical'
        if i.get('identifier'):
            i['identifier']=i.get('identifier')
        else:
            i['identifier']=''
        tuples=(i.get("PMID"),i.get("identifier"),i.get("word"),i.get("sentence"))
        if tuples not in Chemical_deduplicated:
            print(i)
            Chemical_deduplicated.add(tuples)
            Chemical_list_raw.append(i)
        else:
            Chemical_deduplicated_count+=1
            continue

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
import json
with open('.\data\steps data output\step-2\3_biological_entities\Chemical_Extraction.json', 'w') as fileread:
        json.dump(pubtator_Chemical, fileread, indent=4)