In [1]:
import sys, signal
from PyQt4 import QtCore, QtGui, QtWebKit

class WebPage(QtWebKit.QWebPage):  
    def __init__(self, db):
        self._db = db
        QtWebKit.QWebPage.__init__(self)
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def process(self, items):
        self._items = iter(items)
        self.fetchNext() 
    
    def fetchNext(self):
        try:
            self._url, self._func = next(self._items)
            self.mainFrame().load(QtCore.QUrl(self._url))
        except StopIteration:
            return False
        return True
    
    def handleLoadFinished(self):
        self._func(self._url, self.mainFrame().toHtml())
        if not self.fetchNext():
            print('CONGRATULATIONS # processing complete')
            QtGui.qApp.quit()

    

In [2]:
# load the list of editions conveniently saved as json

import json

load_path = "data/ploscompbio_editions.json"

with open(load_path, 'r') as infile:  
    list_of_editions = json.load(infile)
    
#print(list_of_editions)

In [3]:
# initialize database

import sqlite3

db = sqlite3.connect('data/articles8.db')
cursor = db.cursor()
cursor.execute('''CREATE TABLE articles8(id INTEGER PRIMARYKEY,
                    url TEXT, authors TEXT, date TEXT, title TEXT, abstract TEXT, author_summary TEXT)
                    ''')


db.commit()





In [None]:
# note - using global db variable here

import urllib
from bs4 import BeautifulSoup

def process_url(url,html):
    print(url)
    print('testing database entry...')
    
    # get the list of articles from the edition
    r = str(html.toAscii())
    edition_soup = BeautifulSoup(r,"lxml")
    edition_sections = edition_soup.select('.section')
    for section in edition_sections:
        sectionID = section.select('a')[0].id
        print(sectionID)
        if sectionID == "Research_Article":
            research_articles_section = section
        
    #try:
    #    research_articles_section = edition_soup.select('.section')[4]
    #except:
    #    research_articles_section = edition_soup.select('.section')[3] # for the javascript loaded pages
    #research_articles = research_articles_section.select('.item')

    for sample_article in research_articles:
        url = sample_article.select('a')[1].text
        print(url)
        
        authors = sample_article.select('p')[0].text
        #authors = authors.split(',') # todo normalize names for whitespace, \n, etc # wait, no do this on removal from database

        date = sample_article.select('p')[1].text.split('|')[0]
        date = date.split('\n')[1] # todo express this as a datetime object (check formatting across articles)

        title = sample_article.select('a')[0].text
        journal = "PLOS Computational Biology" 
        
        print("requesting article...")
        # go the the article url and read text
        r = urllib.urlopen(url).read()
        article_soup = BeautifulSoup(r,"lxml")
        abstract = article_soup.select("div.abstract")[0].text[8:] # get abstract from html & pick off first 8 characters,
                                        # these characters read, "Abstract"
        try:
            author_summary = article_soup.select("div.abstract")[1].text[15:] # "author summary" 
        except: # a few articles don't have author summaries
            author_summary = "No author summary found"
        
        article_data_object = {"url":url, "authors":authors, "date":date, "title":title,"abstract":abstract,
                                  "author_summary":author_summary}
        
        # add article data object to a database        
        cursor.execute('''INSERT INTO articles8(url, authors, date, title, abstract, author_summary)
                VALUES(:url, :authors, :date, :title, :abstract, :author_summary)''',
                article_data_object)

        db.commit()
        print('entry added to database')

In [None]:
edition_urls = [ed['url'] for ed in list_of_editions]
edition_urls = edition_urls

items = [ (url,process_url) for url in edition_urls]

signal.signal(signal.SIGINT, signal.SIG_DFL)
print('press ctrl+c to quit\n')
app = QtGui.QApplication(sys.argv)
webpage = WebPage(db)
webpage.process(items)
sys.exit(app.exec_())

db.close()
print('database closed')


press ctrl+c to quit

http://journals.plos.org/ploscompbiol/issue?id=10.1371/issue.pcbi.v14.i01
testing database entry...
https://doi.org/10.1371/journal.pcbi.1005973
requesting article...




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


entry added to database
https://doi.org/10.1371/journal.pcbi.1005931
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005953
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005949
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005968
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005951
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005943
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005930
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005944
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005952
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005974
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005941
requesting article...

entry added to database
https://doi.org/10.1371/journal.pcbi.1006078
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006080
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005977
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005877
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006070
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006032
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006056
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006046
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005985
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006059
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005897
requesting article...

entry added to database
https://doi.org/10.1371/journal.pcbi.1006040
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1005996
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006079
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006095
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006067
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006044
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006076
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006092
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006084
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006075
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006053
requesting article...

entry added to database
https://doi.org/10.1371/journal.pcbi.1006201
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006145
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006229
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006195
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006143
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006187
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006205
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006182
requesting article...
entry added to database
https://doi.org/10.1371/journal.pcbi.1006178
requesting article...
entry added to database
http://journals.plos.org/ploscompbiol/issue?id=10.1371/issue.pcbi.v14.i07
testing database entry...


IndexError: list index out of range