# Metadata

```yaml
course:   DS 5001 
topic:    Pipeline
author:   Andrew Chaphiv (acgq2@virginia.edu)
date:    SPR2023
```

# Importing Modules 

In [None]:
import pandas as pd 
import csv
import re
import urllib
from time import sleep

# Using API

In [168]:
query = 'cancer+treatment'

# common settings between esearch and efetch
base_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
db = 'db=pubmed'

# esearch specific settings
search_eutil = 'esearch.fcgi?'
search_term = '&term=' + query
search_usehistory = '&usehistory=y'
search_rettype = '&rettype=json'
search_mindate, search_maxdate = '&mindate=2005','&maxdate=2005'

In [169]:
search_url = base_url+search_eutil+db+search_term+search_usehistory+search_rettype+search_mindate+search_maxdate
print(search_url)

http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=cancer+treatment&usehistory=y&rettype=json&mindate=2005&maxdate=2005


In [173]:
f = urllib.request.urlopen(search_url)
search_data = f.read().decode('utf-8')
search_data

'<?xml version="1.0" encoding="UTF-8" ?>\n<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">\n<eSearchResult><Count>57057</Count><RetMax>20</RetMax><RetStart>0</RetStart><QueryKey>1</QueryKey><WebEnv>MCID_6413a2172764c94de824c8e8</WebEnv><IdList>\n<Id>16385691</Id>\n<Id>16385690</Id>\n<Id>16385665</Id>\n<Id>16385661</Id>\n<Id>16385656</Id>\n<Id>16385650</Id>\n<Id>16385648</Id>\n<Id>16385625</Id>\n<Id>16385613</Id>\n<Id>16385577</Id>\n<Id>16385575</Id>\n<Id>16385573</Id>\n<Id>16385570</Id>\n<Id>16385569</Id>\n<Id>16385568</Id>\n<Id>16385565</Id>\n<Id>16385556</Id>\n<Id>16385476</Id>\n<Id>16385440</Id>\n<Id>16385437</Id>\n</IdList><TranslationSet><Translation>     <From>cancer</From>     <To>"cancer\'s"[All Fields] OR "cancerated"[All Fields] OR "canceration"[All Fields] OR "cancerization"[All Fields] OR "cancerized"[All Fields] OR "cancerous"[All Fields] OR "neoplasms"[MeSH Terms] OR "neoplasms"[All Fields

In [174]:
total_abstract_count = int(re.findall("<Count>(\d+?)</Count>",search_data)[0])
print(total_abstract_count)

57057


In [175]:
fetch_webenv = "&WebEnv=" + re.findall ("<WebEnv>(\S+)<\/WebEnv>", search_data)[0]
fetch_querykey = "&query_key=" + re.findall("<QueryKey>(\d+?)</QueryKey>",search_data)[0]

In [176]:
fetch_eutil = 'efetch.fcgi?'
retmax = 500
retstart = 0
fetch_retstart = "&retstart=" + str(retstart)
fetch_retmax = "&retmax=" + str(retmax)
fetch_retmode = "&retmode=text"
fetch_rettype = "&rettype=abstract"

In [177]:
fetch_url = base_url+fetch_eutil+db+fetch_querykey+fetch_webenv+fetch_retstart+fetch_retmax+fetch_retmode+fetch_rettype
f = urllib.request.urlopen (fetch_url)
fetch_data = f.read().decode('utf-8')
# split the data into individual abstracts
abstracts = fetch_data.split("\n\n\n")

In [178]:
fetch_url

'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key=1&WebEnv=MCID_6413a2172764c94de824c8e8&retstart=0&retmax=500&retmode=text&rettype=abstract'

# Batch Grabbing the Abstracts

In [10]:
years = ['1995',
         '2000',
         '2005',
         '2010',
         '2015',
         '2020',
         '2023']

In [180]:
query = "cancer+treatment"

# common settings between esearch and efetch
base_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
db = 'db=pubmed'

# esearch settings
search_eutil = 'esearch.fcgi?'
search_term = '&term=' + query
search_usehistory = '&usehistory=y'
search_rettype = '&rettype=json'
total_abstract_count = 2000
fetch_eutil = 'efetch.fcgi?'
retmax = 500
retstart = 0
fetch_retmode = "&retmode=text"
fetch_rettype = "&rettype=abstract"

for i in years:
    search_mindate, search_maxdate = '&mindate='+i,'&maxdate='+i
    search_url = base_url+search_eutil+db+search_term+search_usehistory+search_rettype+search_mindate+search_maxdate
    print("this is the esearch command:\n" + search_url + "\n")
    f = urllib.request.urlopen (search_url)
    search_data = f.read().decode('utf-8')
    fetch_webenv = "&WebEnv=" + re.findall ("<WebEnv>(\S+)<\/WebEnv>", search_data)[0]
    fetch_querykey = "&query_key=" + re.findall("<QueryKey>(\d+?)</QueryKey>",search_data)[0]
    run = True
    all_abstracts = list()
    loop_counter = 1
    while run:
        loop_counter += 1
        fetch_retstart = "&retstart=" + str(retstart)
        fetch_retmax = "&retmax=" + str(retmax)
        # create the efetch url
        fetch_url = base_url+fetch_eutil+db+fetch_querykey+fetch_webenv+fetch_retstart+fetch_retmax+fetch_retmode+fetch_rettype
        # open the efetch url
        f = urllib.request.urlopen (fetch_url)
        fetch_data = f.read().decode('utf-8')
        # split the data into individual abstracts
        abstracts = fetch_data.split("\n\n\n")
        # append to the list all_abstracts
        all_abstracts = all_abstracts+abstracts
        # update retstart to download the next chunk of abstracts
        retstart = retstart + retmax
        if retstart > total_abstract_count:
            print("a total of " + str(len(all_abstracts)) + " abstracts have been downloaded.\n")
            run = False
    with open("abstracts{}.csv".format(i), "wt", encoding = 'utf-8') as abstracts_file:
        # csv writer for full abstracts
        abstract_writer = csv.writer(abstracts_file)
        abstract_writer.writerow(['Journal', 'Title', 'Authors', 'Author_Information', 'Abstract', 'DOI', 'Misc'])
        #For each abstract, split into categories and write it to the csv file
        for abstract in all_abstracts:
            #To obtain categories, split every double newline.
            split_abstract = abstract.split("\n\n")
            if 5 <= len(split_abstract) <= 7:
                abstract_writer.writerow(split_abstract)
                retstart = 0

this is the esearch command:
http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=cancer+treatment&usehistory=y&rettype=json&mindate=1995&maxdate=1995

a total of 2460 abstracts have been downloaded.

this is the esearch command:
http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=cancer+treatment&usehistory=y&rettype=json&mindate=2000&maxdate=2000

a total of 2460 abstracts have been downloaded.

this is the esearch command:
http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=cancer+treatment&usehistory=y&rettype=json&mindate=2005&maxdate=2005

a total of 2460 abstracts have been downloaded.

this is the esearch command:
http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=cancer+treatment&usehistory=y&rettype=json&mindate=2010&maxdate=2010

a total of 2460 abstracts have been downloaded.

this is the esearch command:
http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=cancer+treatme

# Cleaning Up

In [181]:
# Now that we have all the abstracts, we can clean them up and condense them into one big df
temp = pd.DataFrame()
df = pd.read_csv('abstracts1995.csv')

# Only care about abstract, title, and year, drop everything else 
df = df[["Title", "Authors", "Abstract"]]
df["Year"] = "1995"
df["Abstract"] = df["Abstract"].str.split('\r\n')
df["Len"] = df["Abstract"].apply(len)
df = df[df["Len"] > 3]
df["Abstract"] = df["Abstract"].apply(lambda x: ''.join(x))
# Some abstracts are misplaced, we can just drop the articles where there are no abstracts. 
df = df[["Title", "Authors", "Abstract", "Year"]].reset_index(drop = True)
df1 = df.copy()
pd.concat([temp, df, df1],ignore_index = True)

Unnamed: 0,Title,Authors,Abstract,Year
0,Stimulation of the sphingomyelin pathway induc...,"Fiebich BL(1), Lieb K, Berger M, Bauer J.",Interleukin-6 (IL-6) has previously been shown...,1995
1,Non-parametric estimation of the post-lead-tim...,"Xu JL(1), Prorok PC.",The goal of screening programmes for cancer is...,1995
2,Non-parametric methods for analysing recurrent...,"Lancar R(1), Kramar A, Haie-Meder C.",Non-parametric methods have recently been prop...,1995
3,Effect of serum albumin on estrogen metabolism...,"Bradlow HL(1), Arcuri F, Blasi L, Castagnetta L.",The observation that charcoal-treated fetal bo...,1995
4,Effect of a new de-N-acetyl-lysoglycosphingoli...,"Tubaro E(1), Borelli GP, Belogi L, Cavallo G, ...",A new de-N-acetylated glycosphingolipid termed...,1995
...,...,...,...,...
3021,Inhibition of metabolism of 4-(methylnitrosami...,"Morse MA(1), Kresty LA, Toburen AL.","As part of a routine screening assay, benzalde...",1995
3022,Overexpressions of c-fos/jun mRNA and their on...,"Morishita S(1), Niwa K, Ichigo S, Hori M, Mura...",To further understand hormonal carcinogenesis ...,1995
3023,Cytokine induction by 41.8 degrees C whole bod...,"Robins HI(1), Kutz M, Wiedemann GJ, Katschinsk...",The potential for 41.8 degrees C whole body hy...,1995
3024,Introduction of murine Il-4 gene into B16(F10)...,"Missol E(1), Sochanik A, Szala S.",Il-4 is a highly pleiotropic cytokine which in...,1995


In [182]:
df = pd.DataFrame()

In [183]:
# Ok do it for every single abstract
for i in years: 
    df1 = pd.read_csv('abstracts{}.csv'.format(i))

    # Only care about abstract, title, and year, drop everything else 
    df1 = df1[["Title", "Authors", "Abstract"]]
    df1["Year"] = i
    df1["Abstract"] = df1["Abstract"].str.split('\r\n')
    df1["Len"] = df1["Abstract"].apply(len)
    df1 = df1[df1["Len"] > 3]
    df1["Abstract"] = df1["Abstract"].apply(lambda x: ''.join(x))
    # Some abstracts are misplaced, we can just drop the articles where there are no abstracts. 
    df1 = df1[["Title", "Authors", "Abstract", "Year"]].reset_index(drop = True)
    df = pd.concat([df,df1],ignore_index = True)



In [184]:
df["Year"].value_counts() ## Abstracts per year

2015    1916
2010    1769
2000    1716
2005    1710
1995    1513
2020    1331
2023    1308
Name: Year, dtype: int64

In [201]:
df["Comment"] = df["Abstract"].str.find("Comment")
df = df[~df["Comment"] != -1]
df = df[["Title", "Authors", "Abstract", "Year"]]
df["Abstract Length"] = df["Abstract"].apply(len)
df.to_csv("condensedabstracts.csv")
