# SARS-CoV-2 PubMed Metadata Extraction

- https://www.december.com/html/spec/httpstat.html
- https://erilu.github.io/pubmed-abstract-compiler/
- https://www.ncbi.nlm.nih.gov/books/NBK25500/
- https://www.ncbi.nlm.nih.gov/books/NBK25499/
- https://www.nlm.nih.gov/bsd/mms/medlineelements.html

In [None]:
# Installing Biopython
! pip install -q biopython

[K     |████████████████████████████████| 2.3 MB 5.9 MB/s 
[?25h

In [None]:
import re
import time
import urllib
import urllib.parse
import urllib.request
from http.client import IncompleteRead
from pprint import pprint

import numpy as np
import pandas as pd
import requests
from Bio import Entrez
from Bio import Medline
from tqdm import tqdm

In [None]:
# Wall-clock start of the search phase; later cells subtract it from a second
# time.time() reading to report elapsed minutes.
search_start = time.time()

In [None]:
######## ESearch ########
# Run the PubMed search once and keep the result set on the NCBI history
# server (usehistory=y), so that later cells can page through it using the
# WebEnv / QueryKey values contained in the response.

# Human-readable query; it is URL-encoded right below.
# The previous hand-encoded value ('covid%19+OR+SARS%Cov%2') used '%' as a
# separator, which is not valid percent-encoding: '%19' decodes to a control
# character, and '%Co' / a trailing '%2' are malformed escapes.
search_phrase = 'covid-19 OR SARS-CoV-2'
query_term = urllib.parse.quote_plus(search_phrase)

# NCBI serves the E-utilities over HTTPS.
base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'

esearch_db = 'db=pubmed'

esearch_eutil = 'esearch.fcgi?'

esearch_query = '&term=' + query_term

esearch_history = '&usehistory=y'

# The response is parsed as XML below (<Count>, and later <WebEnv>/<QueryKey>),
# so ask for XML explicitly. The former '&rettype=json' did not switch the
# output format: the XML tags were still present in the response.
esearch_retmode = '&retmode=xml'
esearch_rettype = esearch_retmode  # old variable name kept for any cell that still reads it

esearch_url = base_url + esearch_eutil + esearch_db + esearch_query + esearch_history + esearch_retmode

# The context manager closes the HTTP response even if read()/decode() fails.
with urllib.request.urlopen(esearch_url) as url_result:
  esearch_url_result = url_result.read().decode('utf-8')

# The first <Count> element in the response is the total number of hits.
count_match = re.search(r"<Count>(\d+)</Count>", esearch_url_result)
if count_match is None:
  raise ValueError("ESearch response did not contain a <Count> element:\n" + esearch_url_result[:500])

total_pmids = int(count_match.group(1))

In [None]:
# Display the assembled ESearch request URL for inspection.
esearch_url

'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=covid%19+OR+SARS%Cov%2&usehistory=y&rettype=json'

In [None]:
# Display the hit count parsed from the <Count> element of the ESearch response.
total_pmids

214668

In [None]:
######## EFetch ########
# Extract the history-server handles from the ESearch XML and turn them into
# URL fragments; the paging loop appends them to every EFetch request.
# Raw strings are used so that \S and \d reach the regex engine without
# triggering Python's "invalid escape sequence" warning.

efetch_webenv = "&WebEnv=" + re.findall(r"<WebEnv>(\S+)</WebEnv>", esearch_url_result)[0]

efetch_querykey = "&query_key=" + re.findall(r"<QueryKey>(\d+?)</QueryKey>", esearch_url_result)[0]

In [None]:
# Page through the stored search result and collect every PMID as a string.
# Each request asks for up to `retmax` ids starting at offset `retstart`.
retmax = 10000
all_pmids = list()

# URL fragments that are identical for every page.
efetch_eutil = 'efetch.fcgi?'
efetch_retmax = "&retmax=" + str(retmax)
efetch_retmode = "&retmode=text"
efetch_rettype = "&rettype=uilist"

# range() yields the offsets 0, retmax, 2*retmax, ... below total_pmids, and
# yields nothing (so no request is sent) when the search had zero hits.
for retstart in range(0, total_pmids, retmax):
  efetch_retstart = "&retstart=" + str(retstart)

  efetch_url = (base_url + efetch_eutil + esearch_db + efetch_querykey + efetch_webenv
                + efetch_retstart + efetch_retmax + efetch_retmode + efetch_rettype)

  # The context manager closes the HTTP response even if reading fails.
  with urllib.request.urlopen(efetch_url) as efetch_url_result:
    efetch_data = efetch_url_result.read().decode('utf-8')

  # One id per line; skip blank lines so they never end up as bogus PMIDs.
  all_pmids.extend(line.strip() for line in efetch_data.splitlines() if line.strip())

  print("a total of " + str(len(all_pmids)) + " PMIDs have been downloaded.\n")

  # Pause between requests to stay gentle on the NCBI servers.
  time.sleep(2)

a total of 10000 PMIDs have been downloaded.

a total of 20000 PMIDs have been downloaded.

a total of 30000 PMIDs have been downloaded.

a total of 40000 PMIDs have been downloaded.

a total of 50000 PMIDs have been downloaded.

a total of 60000 PMIDs have been downloaded.

a total of 70000 PMIDs have been downloaded.

a total of 80000 PMIDs have been downloaded.

a total of 90000 PMIDs have been downloaded.

a total of 100000 PMIDs have been downloaded.

a total of 110000 PMIDs have been downloaded.

a total of 120000 PMIDs have been downloaded.

a total of 130000 PMIDs have been downloaded.

a total of 140000 PMIDs have been downloaded.

a total of 150000 PMIDs have been downloaded.

a total of 160000 PMIDs have been downloaded.

a total of 170000 PMIDs have been downloaded.

a total of 180000 PMIDs have been downloaded.

a total of 190000 PMIDs have been downloaded.

a total of 200000 PMIDs have been downloaded.

a total of 210000 PMIDs have been downloaded.

a total of 214668 PMID

In [None]:
# Report the minutes elapsed since search_start. When the notebook is run top
# to bottom this spans both the ESearch request and the PMID paging loop.
search_end = time.time() 
print(f"Esearch Took:\t{(search_end-search_start)/60}")

Esearch Took:	1.2240100423494975


In [None]:
# Wall-clock start of the record download; read again when reporting its duration.
fetch_start = time.time()

In [None]:
# Download the MEDLINE record for every PMID, one batch of ids per request.
# `all_data` is a list of batches; each batch is the list of records produced
# by Medline.parse for that request.
Entrez.email = 'akishirsath@gmail.com'  # contact address sent with each request; set once, not per batch

FETCH_BATCH_SIZE = 10000
MAX_FETCH_ATTEMPTS = 3

all_data = list()
failed_batches = list()  # start offsets (into all_pmids) of batches that never downloaded

for i in tqdm(range(0, len(all_pmids), FETCH_BATCH_SIZE)):

  batch_pmids = all_pmids[i:i+FETCH_BATCH_SIZE]

  # A truncated HTTP body (IncompleteRead) used to be swallowed, silently
  # dropping the whole batch. Retry a few times with a growing pause, and
  # remember the batch if it still fails so the loss is visible.
  for attempt in range(1, MAX_FETCH_ATTEMPTS + 1):
    try:
      handle = Entrez.efetch(db="pubmed",
                             sort='best',
                             id=",".join(batch_pmids),
                             rettype="medline",
                             retmode="text")
      try:
        # Materialise the parser output before the handle is closed.
        batch_records = list(Medline.parse(handle))
      finally:
        handle.close()
      all_data.append(batch_records)
      break
    except IncompleteRead:
      time.sleep(2 * attempt)
  else:
    # Runs only when no attempt reached `break`.
    failed_batches.append(i)

  # Pause between batches to stay gentle on the NCBI servers.
  time.sleep(2)

if failed_batches:
  print(f"{len(failed_batches)} batch(es) could not be downloaded; start offsets: {failed_batches}")

100%|██████████| 22/22 [1:02:04<00:00, 169.30s/it]


In [None]:
# Report the minutes elapsed since fetch_start, i.e. the duration of the
# MEDLINE record download.
fetch_end = time.time()
print(f"Efetch Took:\t{(fetch_end-fetch_start)/60}")

Efetch Took:	62.07600411574046


In [None]:
# Number of records PubMed reported that were NOT downloaded.
# all_data is a list of batches, so the records must be counted inside each
# batch; len(all_data) alone is the number of batches.
total_pmids - sum(len(batch) for batch in all_data)

214647

In [None]:
# Download duration again: the elapsed seconds are rounded to a whole number
# first and only then divided by 60, so the printed value is fractional minutes.
# NOTE(review): if whole minutes were intended, the division belongs inside round().
print(round(fetch_end - fetch_start)/60)

62.083333333333336


In [None]:
# Number of records in the first downloaded batch.
len(all_data[0])

10000

In [None]:
# MEDLINE field tags to keep from each record; they also become the column
# names of the output table, in this order.
# Per the NLM MEDLINE element list: PMID = PubMed id, TI = title, AB = abstract,
# AD = affiliation, FAU = full author name, DP = publication date,
# TA = journal title abbreviation, JT = full journal title, LA = language,
# MH = MeSH terms, OAB = other abstract, OT = other term (keywords),
# PL = place of publication, PST = publication status, PT = publication type,
# RN = registry number, SI = secondary source id.
required_keys = ['PMID', 'TI', 'AB', 'AD', 'FAU', 'DP', 'TA', 'JT', 'LA', 
                 'MH', 'OAB', 'OT', 'PL', 'PST', 'PT', 'RN', 'SI']

In [None]:
def _flatten_field(value):
  """Turn one MEDLINE field value into a single string cell.

  List-valued fields are de-duplicated and joined with ';'. dict.fromkeys keeps
  the first occurrence of each item in its original position, so the order
  stored in the record (e.g. the author sequence in FAU) is preserved and the
  output is identical from run to run. Joining a set() gave an arbitrary order.
  Any other value is returned unchanged.
  """
  if isinstance(value, list):
    return ";".join(dict.fromkeys(value))
  return value


# One row per record, one cell per entry of required_keys. Fields missing from
# a record are filled with the string "NONE".
main_data = [
  [_flatten_field(single_data.get(key, "NONE")) for key in required_keys]
  for batch_data in tqdm(all_data)
  for single_data in batch_data
]

100%|██████████| 21/21 [00:04<00:00,  5.12it/s]


In [None]:
# Build the table: one row per record, columns named after the MEDLINE tags in
# required_keys (same order as the cells of each row in main_data).
main_dataframe = pd.DataFrame(main_data, columns=required_keys)

In [None]:
# Column overview. Missing fields were filled with the string "NONE" when the
# rows were built, so the non-null counts here do not reveal absent values.
main_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204668 entries, 0 to 204667
Data columns (total 17 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   PMID    204668 non-null  object
 1   TI      204668 non-null  object
 2   AB      204668 non-null  object
 3   AD      204668 non-null  object
 4   FAU     204668 non-null  object
 5   DP      204668 non-null  object
 6   TA      204668 non-null  object
 7   JT      204668 non-null  object
 8   LA      204668 non-null  object
 9   MH      204668 non-null  object
 10  OAB     204668 non-null  object
 11  OT      204668 non-null  object
 12  PL      204668 non-null  object
 13  PST     204668 non-null  object
 14  PT      204668 non-null  object
 15  RN      204668 non-null  object
 16  SI      204668 non-null  object
dtypes: object(17)
memory usage: 26.5+ MB


In [None]:
# Preview the first two rows.
main_dataframe.head(2)

Unnamed: 0,PMID,TI,AB,AD,FAU,DP,TA,JT,LA,MH,OAB,OT,PL,PST,PT,RN,SI
0,35052023,Does a Mental Health Diagnosis Worsen Outcomes...,NONE,"Department of Psychiatry, Icahn School of Medi...","Vadukapuram, Ramu;Mansuri, Zeeshan;Trivedi, Ch...",2022 Jan 20,Prim Care Companion CNS Disord,The primary care companion for CNS disorders,eng,NONE,NONE,NONE,United States,epublish,Journal Article,NONE,NONE
1,35052021,The Opioid Epidemic and the COVID-19 Pandemic:...,NONE,"Department of Psychiatry, AdventHealth Orlando...","Das, Aparna;Padala, Akshay P;Wilson, Kerrie B",2022 Jan 20,Prim Care Companion CNS Disord,The primary care companion for CNS disorders,eng,NONE,NONE,NONE,United States,epublish,Journal Article,NONE,NONE


In [None]:
# Write the table to a CSV file in the current working directory;
# index=False leaves the row index out of the file.
main_dataframe.to_csv("all-covid-19-pubmed-metadata.csv", index=False)

In [None]:
# Copy the exported CSV into a Google Drive project folder.
# NOTE(review): both paths are hard-coded absolute paths, and the destination
# presumably requires Drive to be mounted at /content/drive, which no visible cell does.
! cp -r /content/all-covid-19-pubmed-metadata.csv /content/drive/MyDrive/04-Work/EDA-Comparison-21st-Epidemics-PubMed

# Citation counts

In [None]:
def get_citation_count(pmid):
  """Look up which PubMed records cite the given article.

  Queries the ELink E-utility with linkname=pubmed_pubmed_citedin and collects
  every <Id> value found in the response.

  Args:
    pmid: PubMed id of the article, as a string or an int.

  Returns:
    A tuple (count, ids): the number of distinct citing PMIDs, and those PMIDs
    joined with ';' in ascending numeric order ('' when there are none).

  Raises:
    urllib.error.URLError: if the HTTP request fails.
  """
  # Normalise to str so the comparison with the ids parsed from the XML works
  # for integer input as well.
  pmid = str(pmid)

  url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&linkname=pubmed_pubmed_citedin&id={pmid}"

  # The context manager closes the HTTP response even if reading fails.
  with urllib.request.urlopen(url) as url_result:
    elink_xml = url_result.read().decode('utf-8')

  citing_pmids = set(re.findall(r"<Id>(\d+?)</Id>", elink_xml))

  # The queried PMID itself can appear among the <Id> elements; it is not a
  # citation. discard() (unlike remove()) does not raise when it is absent,
  # e.g. when the response carries no ids at all.
  citing_pmids.discard(pmid)

  # Sort so the joined string is the same on every run.
  return len(citing_pmids), ";".join(sorted(citing_pmids, key=int))


In [None]:
# Example: look up the articles citing a single PMID.
# This reuses get_citation_count instead of repeating its body, which also
# avoids overwriting the global esearch_url_result that holds the ESearch
# response (and is needed to re-run the WebEnv / QueryKey extraction).
pmid = '34986503'

citation_count, citing_pmids = get_citation_count(pmid)

# Keep `pmids_list` as a set of PMID strings, as the following cell expects.
pmids_list = set(citing_pmids.split(";")) if citing_pmids else set()

In [None]:
# Number of distinct citing PMIDs found for the example article.
len(pmids_list)

0

In [None]:
# Wall-clock end of the whole run; compared with search_start below.
fin_end = time.time()



In [None]:
# Total time from search_start to fin_end: the elapsed seconds are rounded to a
# whole number first and then divided by 60, so the value is fractional minutes.
# NOTE(review): if whole minutes were intended, the division belongs inside round().
print(round(fin_end - search_start)/60)

63.71666666666667
