This repository has been archived by the owner on Mar 1, 2023. It is now read-only.

Commit

Managing revisions gracefully
rabdill committed Jul 25, 2018
1 parent 7450e96 commit bc97264
Showing 4 changed files with 58 additions and 16 deletions.
5 changes: 4 additions & 1 deletion api/db.py
@@ -47,7 +47,10 @@ def fetch_table_data(self, table):
cursor.execute("SELECT column_name FROM information_schema.columns WHERE table_name='{}';".format(table))
for result in cursor:
headers.append(result[0])
cursor.execute("SELECT * FROM {};".format(table))
extra = ""
if table == "articles":
extra = " ORDER BY last_crawled DESC"
cursor.execute("SELECT * FROM {}{} LIMIT 100;".format(table, extra))
for result in cursor: # can't just return the cursor; it's closed when this function returns
data.append(result)
finally:
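
For context, the new branch only changes the query built for the articles table; every other table still gets an empty extra clause. A minimal sketch of the resulting strings (the table names are just ones used elsewhere in this commit):

    >>> "SELECT * FROM {}{} LIMIT 100;".format("articles", " ORDER BY last_crawled DESC")
    'SELECT * FROM articles ORDER BY last_crawled DESC LIMIT 100;'
    >>> "SELECT * FROM {}{} LIMIT 100;".format("authors", "")
    'SELECT * FROM authors LIMIT 100;'
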
4 changes: 3 additions & 1 deletion api/endpoints.py
@@ -233,7 +233,9 @@ def author_details(connection, id):
articles = connection.read("SELECT alltime_ranks.rank, ytd_ranks.rank, articles.id, articles.url, articles.title, articles.abstract, articles.collection, articles.collection_rank FROM articles INNER JOIN article_authors ON article_authors.article=articles.id LEFT JOIN alltime_ranks ON articles.id=alltime_ranks.article LEFT JOIN ytd_ranks ON articles.id=ytd_ranks.article WHERE article_authors.author={}".format(id))

alltime_count = connection.read("SELECT COUNT(article) FROM alltime_ranks")
alltime_count = alltime_count[0][0]
alltime_count = alltime_count[0][0]
# NOTE: alltime_count will not be a count of all the papers on the site,
# it excludes papers that don't have any traffic data.

for article in articles:
result["articles"].append({
2 changes: 1 addition & 1 deletion spider/db.py
@@ -26,7 +26,7 @@ def _ensure_database_exists(self, dbname, host, user, password):
db.close()

def _ensure_tables_exist(self):
self.cursor.execute("CREATE TABLE IF NOT EXISTS articles (id SERIAL PRIMARY KEY, url text UNIQUE, title text NOT NULL, abstract text, collection text, collection_rank integer, title_vector tsvector, abstract_vector tsvector, last_crawled DATE NOT NULL DEFAULT CURRENT_DATE);")
self.cursor.execute("CREATE TABLE IF NOT EXISTS articles (id SERIAL PRIMARY KEY, url text UNIQUE, title text NOT NULL, abstract text, doi text UNIQUE, collection text, collection_rank integer, title_vector tsvector, abstract_vector tsvector, last_crawled DATE NOT NULL DEFAULT CURRENT_DATE);")
self.cursor.execute("CREATE TABLE IF NOT EXISTS authors (id SERIAL PRIMARY KEY, given text NOT NULL, surname text, UNIQUE (given, surname));")
self.cursor.execute("CREATE TABLE IF NOT EXISTS article_authors (id SERIAL PRIMARY KEY, article integer NOT NULL, author integer NOT NULL, UNIQUE (article, author));")
self.cursor.execute("CREATE TABLE IF NOT EXISTS article_traffic (id SERIAL PRIMARY KEY, article integer NOT NULL, month integer, year integer NOT NULL, abstract integer, pdf integer, UNIQUE (article, month, year));")
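
The only schema change here is the new doi column on articles, declared UNIQUE. That means a second insert with the same DOI raises psycopg2.IntegrityError, and the connection then has to be rolled back (or committed) before it will accept further statements. A rough sketch of that behavior, with made-up connection parameters and values that are not part of this commit:

    import psycopg2

    db = psycopg2.connect(dbname="rxdb", user="postgres")  # hypothetical connection
    with db.cursor() as cursor:
        cursor.execute("INSERT INTO articles (url, title, doi) VALUES (%s, %s, %s);",
                       ("https://example.org/a", "First posting", "10.1101/000001"))
        try:
            cursor.execute("INSERT INTO articles (url, title, doi) VALUES (%s, %s, %s);",
                           ("https://example.org/b", "Revised posting", "10.1101/000001"))
        except psycopg2.IntegrityError:
            db.rollback()  # end the aborted transaction before reusing the connection
            print("Duplicate DOI")
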
63 changes: 50 additions & 13 deletions spider/spider.py
@@ -43,6 +43,7 @@ def process_results_entry(self, html, collection):
self._find_title(html)
self._find_url(html)
self._find_authors(html)
self._find_doi(html)
self.collection = collection
# NOTE: We don't get abstracts from search result pages
# because they're loaded asynchronously and it would be
@@ -53,6 +54,17 @@ def _find_title(self, html):
# this looks weird because the title is wrapped
# in 2 <span> tags with identical classes:
self.title = x[0].text

def _find_doi(self, html):
x = html.find(".highwire-cite-metadata-doi")
if len(x) == 0:
return
try:
m = re.search('https://doi.org/(.*)', x[0].text)
except:
return
if len(m.groups()) > 0:
self.doi = m.group(1)

def _find_url(self, html):
self.url = html.absolute_links.pop() # absolute_links is a set
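
As an aside, the regex in the new _find_doi helper just strips the "https://doi.org/" prefix off the citation-metadata text. A standalone sketch with a made-up input string, not taken from the commit:

    import re

    text = "doi: https://doi.org/10.1101/2018.07.25.123456"  # hypothetical metadata text
    m = re.search('https://doi.org/(.*)', text)
    if m is not None:
        print(m.group(1))  # 10.1101/2018.07.25.123456
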
@@ -70,23 +82,41 @@ def _find_authors(self, html):
last = entry.find(".nlm-surname")[0].text
self.authors.append(Author(first, last))

def record(self, connection):
def record(self, connection, spider):
with connection.db.cursor() as cursor:
try:
cursor.execute("INSERT INTO articles (url, title, collection) VALUES (%s, %s, %s) RETURNING id;", (self.url, self.title, self.collection))
except psycopg2.IntegrityError as err:
if repr(err).find('duplicate key value violates unique constraint "articles_pkey"', 1):
# check to see if we've seen this article before
responses = []
cursor.execute("SELECT url FROM articles WHERE doi=%s", (self.doi,))
for x in cursor: # TODO: Look at using cursor.fetchone() here
responses.append(x)
if len(responses) > 0:
if responses[0] == self.url:
print("Found article already: {}".format(self.title))
connection.db.commit()
return False
else:
raise
cursor.execute("UPDATE articles SET url=%s, title=%s, collection=%s WHERE doi=%s RETURNING id;", (self.url, self.title, self.collection, self.doi))
print("Updated revision for article DOI {}: {}".format(self.doi, self.title))
# TODO: Update AUTHORS for revisions. This will be annoying.
connection.db.commit()
return True
# If it's brand new:
with connection.db.cursor() as cursor:
try:
cursor.execute("INSERT INTO articles (url, title, doi, collection) VALUES (%s, %s, %s, %s) RETURNING id;", (self.url, self.title, self.doi, self.collection))
finally:
connection.db.commit() # Needed to end the botched transaction
self.id = cursor.fetchone()[0]

author_ids = self._record_authors(connection)
self._link_authors(author_ids, connection)
print("Recorded article {}".format(self.title))

# fetch traffic stats for the new article
# TODO: this should be a method for Article, not Spider
print("Recording stats for new article:")
stat_table = spider.get_article_stats(self.url)
spider.save_article_stats(self.id, stat_table)
return True

def _record_authors(self, connection):
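
The TODO about cursor.fetchone() above points at collapsing the accumulation loop into a single fetch. One way that could look, as a small hypothetical helper rather than anything in this commit:

    def article_url_for_doi(cursor, doi):
        """Sketch: return the previously recorded URL for a DOI, or None."""
        cursor.execute("SELECT url FROM articles WHERE doi=%s", (doi,))
        response = cursor.fetchone()  # None when no row matches
        return response[0] if response is not None else None
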
@@ -131,6 +161,7 @@ def find_record_new_articles(self, collection):

pagecount = 10 if TESTING else determine_page_count(r.html) # Also just for testing TODO delete
for p in range(1, pagecount): # iterate through pages
print("\n---\n\nFetching page {} in {}".format(p+1, collection)) # pages are zero-indexed
r = self.session.get("https://www.biorxiv.org/collection/{}?page={}".format(collection, p))
results = pull_out_articles(r.html, collection)
keep_going = self.record_articles(results)
@@ -150,7 +181,7 @@ def refresh_article_stats(self, collection):
print("Refreshing article download stats...")
with self.connection.db.cursor() as cursor:
# TODO: Add "where" clause based on last_crawled date (also UPDATE that value!)
cursor.execute("SELECT id, url FROM articles WHERE collection=%s;", (collection,))
cursor.execute("SELECT id, url FROM articles WHERE collection=%s AND last_crawled < now() - interval '1 month';", (collection,))
for article in cursor:
url = article[1]
article_id = article[0]
@@ -203,12 +234,15 @@ def save_article_stats(self, article_id, stats):
sql = "INSERT INTO article_traffic (article, month, year, abstract, pdf) VALUES (%s, %s, %s, %s, %s);"
params = [(article_id, x[0], x[1], x[2], x[3]) for x in to_record]
cursor.executemany(sql, params)
print("Recorded {} stats for ID {}".format(cursor.rowcount, article_id))

cursor.execute("UPDATE articles SET last_crawled = CURRENT_DATE WHERE id=%s", (article_id,))

print("Recorded {} stats for ID {}".format(len(to_record), article_id))
self.connection.db.commit()

def rank_articles(self):
# pulls together all the separate ranking calls
# self._rank_articles_alltime()
self._rank_articles_alltime()
categories = []
with self.connection.db.cursor() as cursor:
cursor.execute("SELECT DISTINCT collection FROM articles ORDER BY collection;")
@@ -225,7 +259,7 @@ def _rank_articles_alltime(self):
print("Ranking papers by popularity...")
with self.connection.db.cursor() as cursor:
cursor.execute("TRUNCATE alltime_ranks_working")
cursor.execute("SELECT article, SUM(pdf) as downloads FROM article_traffic GROUP BY article ORDER BY downloads DESC") # LIMIT 50")
cursor.execute("SELECT article, SUM(pdf) as downloads FROM article_traffic GROUP BY article ORDER BY downloads DESC")
sql = "INSERT INTO alltime_ranks_working (article, rank, downloads) VALUES (%s, %s, %s);"
params = [(record[0], rank, record[1]) for rank, record in enumerate(cursor, start=1)]
cursor.executemany(sql, params)
@@ -304,7 +338,7 @@ def update_article(self, article_id, abstract):
def record_articles(self, articles):
# return value is whether we encountered any articles we had already
for x in articles:
if not x.record(self.connection): return False
if not x.record(self.connection, self): return False # TODO: don't pass the whole damn spider here
return True

def calculate_vectors(self):
@@ -328,7 +362,10 @@ def full_run(spider, collection="bioinformatics"):
full_run(spider)
elif sys.argv[1] == "rankings":
spider.rank_articles()
elif sys.argv[1] == "tsvectors":
spider.calculate_vectors()
elif sys.argv[1] == "traffic":
if len(sys.argv) > 2:
spider.refresh_article_stats(sys.argv[2])
else:
print("Must specify collection to refresh traffic stats for.")
else:
full_run(spider, sys.argv[1])
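
With the new argument handling at the end, refreshing traffic stats now requires naming a collection on the command line. A likely invocation (the collection name here is just an example) would be something like:

    python spider.py traffic bioinformatics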
