In [1]:
import sys, signal
from PyQt4 import QtCore, QtGui, QtWebKit

class WebPage(QtWebKit.QWebPage):
    """Web page that loads a queue of URLs one after another.

    Each queued item is a ``(url, callback)`` pair.  When the main frame
    finishes loading ``url``, ``callback(url, html)`` is called with the
    frame's markup (``mainFrame().toHtml()``) and the next item is loaded.
    Once the queue is exhausted the Qt application is asked to quit.
    """

    def __init__(self, db):
        """Create the page and hook the main frame's ``loadFinished`` signal.

        db -- stored on the instance as ``_db``; the methods of this class
              never read it.
        """
        # Initialise the Qt base class before setting up instance state.
        QtWebKit.QWebPage.__init__(self)
        self._db = db
        self._items = iter(())  # nothing queued until process() is called
        self._url = None
        self._func = None
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def process(self, items):
        """Queue ``items`` (an iterable of ``(url, callback)``) and start loading."""
        self._items = iter(items)
        self.fetchNext()

    def fetchNext(self):
        """Start loading the next queued URL.

        Returns True when a load was started and False when the queue is empty.
        """
        # Only next() can signal exhaustion, so keep the load call out of the try.
        try:
            self._url, self._func = next(self._items)
        except StopIteration:
            return False
        self.mainFrame().load(QtCore.QUrl(self._url))
        return True

    def handleLoadFinished(self, ok=True):
        """Slot for ``loadFinished``: run the callback, then advance the queue.

        ok -- success flag emitted by ``loadFinished(bool)``.  It defaults to
              True so the method can still be called without arguments.
        """
        try:
            if ok:
                self._func(self._url, self.mainFrame().toHtml())
            else:
                # Do not hand the markup of a failed load to the callback.
                print('WARNING # failed to load %s' % self._url)
        finally:
            # Always advance, even if the callback raised; otherwise no new
            # load is started and the event loop would never be asked to quit.
            if not self.fetchNext():
                print('CONGRATULATIONS # processing complete')
                QtGui.qApp.quit()

    

In [2]:
# load the list of editions conveniently saved as json

import json

# Path of the JSON file holding the previously saved list of editions.
load_path = "data/ploscompbio_editions.json"

# Read the whole file, then parse it; the handle is closed on leaving the block.
with open(load_path, 'r') as editions_file:
    list_of_editions = json.loads(editions_file.read())
    
#print(list_of_editions)

In [3]:
# initialize database

import sqlite3

# Open (or create) the SQLite file that stores the scraped articles.
db = sqlite3.connect('data/articles0.db')
cursor = db.cursor()

# "PRIMARY KEY" must be two words: written as "PRIMARYKEY" SQLite treats it as
# part of the column's type name, so `id` is not a key and is never filled in.
# IF NOT EXISTS lets this cell be re-run against an existing database file.
# NOTE: a table already created with the old definition is left unchanged.
cursor.execute('''CREATE TABLE IF NOT EXISTS articles0(id INTEGER PRIMARY KEY,
                    url TEXT, authors TEXT, date TEXT, title TEXT, abstract TEXT, author_summary TEXT)
                    ''')

db.commit()





In [None]:
# NOTE: process_url below relies on the module-level `db` and `cursor` objects
# created in the database-initialisation cell.

import urllib
from bs4 import BeautifulSoup

def _html_to_unicode(html):
    """Return ``html`` as text, decoding a Qt string through UTF-8 when needed."""
    # A Qt string exposes toUtf8(); UTF-8 keeps characters that an 8-bit
    # conversion such as toAscii() cannot represent.
    if hasattr(html, 'toUtf8'):
        return bytes(html.toUtf8()).decode('utf-8')
    return html


def process_url(url, html):
    """Store every research article listed on one edition page.

    url  -- address of the edition page (used for log output only).
    html -- markup of the edition page; a Qt string or ordinary text.

    For each ``.item`` of the section whose first anchor has the id
    ``Research_Article``, the article page is downloaded and one row is
    inserted into ``articles0`` through the module-level ``cursor``/``db``.
    Nothing is stored when the page has no such section.
    """
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    print(url)
    print('testing database entry...')

    # get the list of articles from the edition
    edition_soup = BeautifulSoup(_html_to_unicode(html), "lxml")
    research_articles_section = None
    for section in edition_soup.select('.section'):
        anchors = section.select('a')
        # A section without an anchor or without an id is simply not a match.
        sectionID = anchors[0].get('id') if anchors else None
        print(sectionID)
        if sectionID == "Research_Article":
            research_articles_section = section

    if research_articles_section is None:
        print('no Research_Article section found in %s' % url)
        return

    for sample_article in research_articles_section.select('.item'):
        # Separate name so the edition url parameter is not overwritten.
        article_url = sample_article.select('a')[1].text
        print(article_url)

        authors = sample_article.select('p')[0].text
        # todo normalize names for whitespace, \n, etc when reading them back from the database

        date = sample_article.select('p')[1].text.split('|')[0]
        date = date.split('\n')[1] # todo express this as a datetime object (check formatting across articles)

        title = sample_article.select('a')[0].text

        print("requesting article...")
        # go to the article url and read text
        try:
            with closing(urlopen(article_url)) as response:
                article_html = response.read()
        except IOError as err:
            # Skip an unreachable article instead of aborting the whole edition.
            print('could not fetch %s: %s' % (article_url, err))
            continue
        article_soup = BeautifulSoup(article_html, "lxml")

        abstract_blocks = article_soup.select("div.abstract")
        if abstract_blocks:
            # drop the first 8 characters, which read "Abstract"
            abstract = abstract_blocks[0].text[8:]
        else:
            abstract = "No abstract found"
        if len(abstract_blocks) > 1:
            # drop the leading "author summary" heading
            author_summary = abstract_blocks[1].text[15:]
        else: # a few articles don't have author summaries
            author_summary = "No author summary found"

        article_data_object = {"url":article_url, "authors":authors, "date":date, "title":title,
                               "abstract":abstract, "author_summary":author_summary}

        # add article data object to a database
        cursor.execute('''INSERT INTO articles0(url, authors, date, title, abstract, author_summary)
                VALUES(:url, :authors, :date, :title, :abstract, :author_summary)''',
                article_data_object)

        db.commit()
        print('entry added to database')

In [None]:
# Pair every edition url with the callback that scrapes it.
edition_urls = [ed['url'] for ed in list_of_editions]
items = [(url, process_url) for url in edition_urls]

# Restore the default SIGINT handler so ctrl+c can stop the Qt event loop.
signal.signal(signal.SIGINT, signal.SIG_DFL)
print('press ctrl+c to quit\n')

# Reuse an existing application object so the cell can be re-run in one kernel.
app = QtGui.QApplication.instance() or QtGui.QApplication(sys.argv)
webpage = WebPage(db)

exit_code = 0
try:
    if items:
        webpage.process(items)
        exit_code = app.exec_()
    # With nothing queued no load ever finishes, so the event loop would never
    # be asked to quit; skip it in that case.
finally:
    # Close the database before exiting: sys.exit() raises SystemExit, so
    # nothing placed after it would run.
    db.close()
    print('database closed')

sys.exit(exit_code)


press ctrl+c to quit

http://journals.plos.org/ploscompbiol/issue?id=10.1371/issue.pcbi.v14.i01
testing database entry...
Cover
Editorial
Education
Research_Article
https://doi.org/10.1371/journal.pcbi.1005973
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005931
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005953
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005949
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005968
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005951
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005943
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005930
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005944
requesting article...
entry added to database
https://doi.org/

entry added to database
https://doi.org/10.1371/journal.pcbi.1005959
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005987
requesting article...
entry added to database
http://journals.plos.org/ploscompbiol/issue?id=10.1371/issue.pcbi.v14.i03
testing database entry...
Cover
Editorial
Education
Research_Article
https://doi.org/10.1371/journal.pcbi.1006054
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006078
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006080
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005977
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005877
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006070
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006032
requesting article...
entry added to database
https://doi.or

entry added to database
https://doi.org/10.1371/journal.pcbi.1006101
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006096
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006094
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006082
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006060
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006040
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005996
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006079
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006095
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006067
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006044
requesting article...

entry added to database
https://doi.org/10.1371/journal.pcbi.1006241
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006208
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006167
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006204
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006233
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006222
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006201
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006145
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006229
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006195
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006143
requesting article...

entry added to database
https://doi.org/10.1371/journal.pcbi.1005306
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005228
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005338
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005327
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005319
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005309
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005299
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005291
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005257
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005337
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005316
requesting article...

entry added to database
https://doi.org/10.1371/journal.pcbi.1005449
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005397
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005446
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005427
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005445
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005394
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005432
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005433
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005385
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005415
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005423
requesting article...

entry added to database
https://doi.org/10.1371/journal.pcbi.1005528
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005517
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005495
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005541
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005539
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005537
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005536
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005515
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005530
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005527
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005418
requesting article...