### Scraping literature

1) The following code gets a list of links from the ABIDEII site and filters out the unwanted ones, mostly subject lists and IEEE papers.

2) It then uses either pubmed_lookup if the link is to a pubmed id number or elsapy if the paper is from sciencedirect.

3) The data that is returned is formatted into a summary with title, authors, journal, year of publication, links to the article and the abstract.

4) The information is written to a text file called 'summary.txt'

In [None]:
import json

import bs4
import requests
from elsapy.elsclient import ElsClient
from elsapy.elsdoc import FullDoc
from pubmed_lookup import PubMedLookup, Publication

##=============================================================================================##
## scrape ABIDEII site for publication links
r = requests.get('http://fcon_1000.projects.nitrc.org/indi/abide/manuscripts.html')
r.raise_for_status()  ## stop here on an HTTP error instead of parsing an error page
soup = bs4.BeautifulSoup(r.text,'html.parser')

## Collect the href of every anchor that looks like a publication link.
## Despite the name, `pmids` holds full URLs (pubmed or sciencedirect), not bare ids.
## No backslash continuations are needed inside the brackets; a backslash followed
## by a comment is a syntax error.
pmids = [
    link['href']
    for link in soup.find_all('a', href=True)  ## get all links (anchors without href are skipped)
    if link['href'][-3:].isdigit()  ## that have pubmed ids at end
    and 'sublist' not in link['href']  ## links that are not subjects lists
    and not link['href'].startswith('http://ieeexplore.ieee.org')  ## without ieee papers
]
##=============================================================================================##
## Load configuration for Elsevier
## config.json is opened relative to the current working directory and must
## contain an 'apikey' entry. The with-block closes the file even when the
## JSON cannot be parsed.
with open("config.json") as con_file:
    config = json.load(con_file)

## Initialize client for Elsevier
client = ElsClient(config['apikey'])
##=============================================================================================##

## Look up every collected link and append a formatted summary to 'summary.txt'.
## Progress and failures are printed to the console; only successful lookups
## are written to the file. The file is opened in append mode, so repeated runs
## add to the existing content.
with open('summary.txt','a') as outfile:

    # NCBI will contact user by email if excessive queries are detected
    email = 'alistair.walsh@unimelb.edu.au'
    for url in pmids:
        # The second-to-last path segment of the url is either 'pubmed' or 'pii'.
        # A link without any '/' has no such segment and falls through to the
        # final else branch instead of raising IndexError.
        url_parts = url.rsplit('/',2)
        interface = url_parts[-2] if len(url_parts) > 1 else ''

        if interface == 'pubmed':
            print(url)
            try:
                # The lookup is inside the try so that one failing record is
                # reported and skipped rather than aborting the whole run.
                lookup = PubMedLookup(url, email)
                publication = Publication(lookup)

                summary = (
                """
TITLE:{title}
AUTHORS:{authors}
JOURNAL:{journal}
YEAR:{year}
URL:{url}
PUBMED:{pubmed}
ABSTRACT:\n{abstract}\n
                """
                .format(
                    title=publication.title,
                    authors=publication.authors,
                    journal=publication.journal,
                    year=publication.year,
                    url=publication.url,
                    pubmed=publication.pubmed_url,
                    abstract=repr(publication.abstract),
                ))
            except Exception as err:
                # Exception (not a bare except) lets Ctrl-C / SystemExit through;
                # the error is shown so the failure can be diagnosed.
                print("###### couldn't get url = ",url,repr(err))
            else:
                print(summary,file=outfile)

        elif interface == 'pii':
            print(url)
            id_num = url_parts[-1] # elsapy requires id number (pubmed_lookup requires url)
            ## ScienceDirect (full-text) document example using PII
            pii_doc = FullDoc(sd_pii = id_num)
            if pii_doc.read(client):
                coredata = pii_doc.data['coredata']
                print(
                """
TITLE:{title}
AUTHORS:{authors}
JOURNAL:{journal}
YEAR:{year}
URL:{url}
PII:{sciencedirect}
ABSTRACT:\n{abstract}
                """
                .format(
                    title=pii_doc.title,
                    authors='; '.join([a['$'] for a in coredata['dc:creator']]),
                    journal=coredata['prism:publicationName'],
                    # coverDate is expected as 'YYYY-MM-DD', so the year is the
                    # first field (index 1 would be the month).
                    year=coredata['prism:coverDate'].split('-')[0],
                    url=coredata['prism:url'],
                    sciencedirect=id_num,
                    abstract=coredata['dc:description'],
                ),file=outfile)
            else:
                print ("Read document failed.")
        else:
            print('failed: neither pubmed or sciencedirect')