This repository has been archived by the owner on Mar 1, 2023. It is now read-only.

Commit

Managing revisions gracefully
rabdill committed Jul 25, 2018
1 parent 7450e96 commit bc97264
Showing 4 changed files with 58 additions and 16 deletions.
5 changes: 4 additions & 1 deletion api/db.py
@@ -47,7 +47,10 @@ def fetch_table_data(self, table):
cursor.execute("SELECT column_name FROM information_schema.columns WHERE table_name='{}';".format(table))
for result in cursor:
headers.append(result[0])
cursor.execute("SELECT * FROM {};".format(table))
extra = ""
if table == "articles":
extra = " ORDER BY last_crawled DESC"
cursor.execute("SELECT * FROM {}{} LIMIT 100;".format(table, extra))
for result in cursor: # can't just return the cursor; it's closed when this function returns
data.append(result)
finally:
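
For context, the new branch only changes the query built for the articles table; every other table still gets an empty extra clause. A minimal sketch of the resulting strings (the table names are just ones used elsewhere in this commit):

    >>> "SELECT * FROM {}{} LIMIT 100;".format("articles", " ORDER BY last_crawled DESC")
    'SELECT * FROM articles ORDER BY last_crawled DESC LIMIT 100;'
    >>> "SELECT * FROM {}{} LIMIT 100;".format("authors", "")
    'SELECT * FROM authors LIMIT 100;'
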
4 changes: 3 additions & 1 deletion api/endpoints.py
@@ -233,7 +233,9 @@ def author_details(connection, id):
articles = connection.read("SELECT alltime_ranks.rank, ytd_ranks.rank, articles.id, articles.url, articles.title, articles.abstract, articles.collection, articles.collection_rank FROM articles INNER JOIN article_authors ON article_authors.article=articles.id LEFT JOIN alltime_ranks ON articles.id=alltime_ranks.article LEFT JOIN ytd_ranks ON articles.id=ytd_ranks.article WHERE article_authors.author={}".format(id))

alltime_count = connection.read("SELECT COUNT(article) FROM alltime_ranks")
alltime_count = alltime_count[0][0]
alltime_count = alltime_count[0][0]
# NOTE: alltime_count will not be a count of all the papers on the site,
# it excludes papers that don't have any traffic data.

for article in articles:
result["articles"].append({
2 changes: 1 addition & 1 deletion spider/db.py
@@ -26,7 +26,7 @@ def _ensure_database_exists(self, dbname, host, user, password):
db.close()

def _ensure_tables_exist(self):
self.cursor.execute("CREATE TABLE IF NOT EXISTS articles (id SERIAL PRIMARY KEY, url text UNIQUE, title text NOT NULL, abstract text, collection text, collection_rank integer, title_vector tsvector, abstract_vector tsvector, last_crawled DATE NOT NULL DEFAULT CURRENT_DATE);")
self.cursor.execute("CREATE TABLE IF NOT EXISTS articles (id SERIAL PRIMARY KEY, url text UNIQUE, title text NOT NULL, abstract text, doi text UNIQUE, collection text, collection_rank integer, title_vector tsvector, abstract_vector tsvector, last_crawled DATE NOT NULL DEFAULT CURRENT_DATE);")
self.cursor.execute("CREATE TABLE IF NOT EXISTS authors (id SERIAL PRIMARY KEY, given text NOT NULL, surname text, UNIQUE (given, surname));")
self.cursor.execute("CREATE TABLE IF NOT EXISTS article_authors (id SERIAL PRIMARY KEY, article integer NOT NULL, author integer NOT NULL, UNIQUE (article, author));")
self.cursor.execute("CREATE TABLE IF NOT EXISTS article_traffic (id SERIAL PRIMARY KEY, article integer NOT NULL, month integer, year integer NOT NULL, abstract integer, pdf integer, UNIQUE (article, month, year));")
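
The only schema change here is the new doi column on articles, declared UNIQUE. That means a second insert with the same DOI raises psycopg2.IntegrityError, and the connection then has to be rolled back (or committed) before it will accept further statements. A rough sketch of that behavior, with made-up connection parameters and values that are not part of this commit:

    import psycopg2

    db = psycopg2.connect(dbname="rxdb", user="postgres")  # hypothetical connection
    with db.cursor() as cursor:
        cursor.execute("INSERT INTO articles (url, title, doi) VALUES (%s, %s, %s);",
                       ("https://example.org/a", "First posting", "10.1101/000001"))
        try:
            cursor.execute("INSERT INTO articles (url, title, doi) VALUES (%s, %s, %s);",
                           ("https://example.org/b", "Revised posting", "10.1101/000001"))
        except psycopg2.IntegrityError:
            db.rollback()  # end the aborted transaction before reusing the connection
            print("Duplicate DOI")
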
63 changes: 50 additions & 13 deletions spider/spider.py
@@ -43,6 +43,7 @@ def process_results_entry(self, html, collection):
self._find_title(html)
self._find_url(html)
self._find_authors(html)
self._find_doi(html)
self.collection = collection
# NOTE: We don't get abstracts from search result pages
# because they're loaded asynchronously and it would be
@@ -53,6 +54,17 @@ def _find_title(self, html):
# this looks weird because the title is wrapped
# in 2 <span> tags with identical classes:
self.title = x[0].text

def _find_doi(self, html):
x = html.find(".highwire-cite-metadata-doi")
if len(x) == 0:
return
try:
m = re.search('https://doi.org/(.*)', x[0].text)
except:
return
if len(m.groups()) > 0:
self.doi = m.group(1)

def _find_url(self, html):
self.url = html.absolute_links.pop() # absolute_links is a set
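
As an aside, the regex in the new _find_doi helper just strips the "https://doi.org/" prefix off the citation-metadata text. A standalone sketch with a made-up input string, not taken from the commit:

    import re

    text = "doi: https://doi.org/10.1101/2018.07.25.123456"  # hypothetical metadata text
    m = re.search('https://doi.org/(.*)', text)
    if m is not None:
        print(m.group(1))  # 10.1101/2018.07.25.123456
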
@@ -70,23 +82,41 @@ def _find_authors(self, html):
last = entry.find(".nlm-surname")[0].text
self.authors.append(Author(first, last))

def record(self, connection):
def record(self, connection, spider):
with connection.db.cursor() as cursor:
try:
cursor.execute("INSERT INTO articles (url, title, collection) VALUES (%s, %s, %s) RETURNING id;", (self.url, self.title, self.collection))
except psycopg2.IntegrityError as err:
if repr(err).find('duplicate key value violates unique constraint "articles_pkey"', 1):
# check to see if we've seen this article before
responses = []
cursor.execute("SELECT url FROM articles WHERE doi=%s", (self.doi,))
for x in cursor: # TODO: Look at using cursor.fetchone() here
responses.append(x)
if len(responses) > 0:
if responses[0] == self.url:
print("Found article already: {}".format(self.title))
connection.db.commit()
return False
else:
raise
cursor.execute("UPDATE articles SET url=%s, title=%s, collection=%s WHERE doi=%s RETURNING id;", (self.url, self.title, self.collection, self.doi))
print("Updated revision for article DOI {}: {}".format(self.doi, self.title))
# TODO: Update AUTHORS for revisions. This will be annoying.
connection.db.commit()
return True
# If it's brand new:
with connection.db.cursor() as cursor:
try:
cursor.execute("INSERT INTO articles (url, title, doi, collection) VALUES (%s, %s, %s, %s) RETURNING id;", (self.url, self.title, self.doi, self.collection))
finally:
connection.db.commit() # Needed to end the botched transaction
self.id = cursor.fetchone()[0]

author_ids = self._record_authors(connection)
self._link_authors(author_ids, connection)
print("Recorded article {}".format(self.title))

# fetch traffic stats for the new article
# TODO: this should be a method for Article, not Spider
print("Recording stats for new article:")
stat_table = spider.get_article_stats(self.url)
spider.save_article_stats(self.id, stat_table)
return True

def _record_authors(self, connection):
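
The TODO about cursor.fetchone() above points at collapsing the accumulation loop into a single fetch. One way that could look, as a small hypothetical helper rather than anything in this commit:

    def article_url_for_doi(cursor, doi):
        """Sketch: return the previously recorded URL for a DOI, or None."""
        cursor.execute("SELECT url FROM articles WHERE doi=%s", (doi,))
        response = cursor.fetchone()  # None when no row matches
        return response[0] if response is not None else None
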
@@ -131,6 +161,7 @@ def find_record_new_articles(self, collection):

pagecount = 10 if TESTING else determine_page_count(r.html) # Also just for testing TODO delete
for p in range(1, pagecount): # iterate through pages
print("\n---\n\nFetching page {} in {}".format(p+1, collection)) # pages are zero-indexed
r = self.session.get("https://www.biorxiv.org/collection/{}?page={}".format(collection, p))
results = pull_out_articles(r.html, collection)
keep_going = self.record_articles(results)
@@ -150,7 +181,7 @@ def refresh_article_stats(self, collection):
print("Refreshing article download stats...")
with self.connection.db.cursor() as cursor:
# TODO: Add "where" clause based on last_crawled date (also UPDATE that value!)
cursor.execute("SELECT id, url FROM articles WHERE collection=%s;", (collection,))
cursor.execute("SELECT id, url FROM articles WHERE collection=%s AND last_crawled < now() - interval '1 month';", (collection,))
for article in cursor:
url = article[1]
article_id = article[0]
@@ -203,12 +234,15 @@ def save_article_stats(self, article_id, stats):
sql = "INSERT INTO article_traffic (article, month, year, abstract, pdf) VALUES (%s, %s, %s, %s, %s);"
params = [(article_id, x[0], x[1], x[2], x[3]) for x in to_record]
cursor.executemany(sql, params)
print("Recorded {} stats for ID {}".format(cursor.rowcount, article_id))

cursor.execute("UPDATE articles SET last_crawled = CURRENT_DATE WHERE id=%s", (article_id,))

print("Recorded {} stats for ID {}".format(len(to_record), article_id))
self.connection.db.commit()

def rank_articles(self):
# pulls together all the separate ranking calls
# self._rank_articles_alltime()
self._rank_articles_alltime()
categories = []
with self.connection.db.cursor() as cursor:
cursor.execute("SELECT DISTINCT collection FROM articles ORDER BY collection;")
@@ -225,7 +259,7 @@ def _rank_articles_alltime(self):
print("Ranking papers by popularity...")
with self.connection.db.cursor() as cursor:
cursor.execute("TRUNCATE alltime_ranks_working")
cursor.execute("SELECT article, SUM(pdf) as downloads FROM article_traffic GROUP BY article ORDER BY downloads DESC") # LIMIT 50")
cursor.execute("SELECT article, SUM(pdf) as downloads FROM article_traffic GROUP BY article ORDER BY downloads DESC")
sql = "INSERT INTO alltime_ranks_working (article, rank, downloads) VALUES (%s, %s, %s);"
params = [(record[0], rank, record[1]) for rank, record in enumerate(cursor, start=1)]
cursor.executemany(sql, params)
@@ -304,7 +338,7 @@ def update_article(self, article_id, abstract):
def record_articles(self, articles):
# return value is whether we encountered any articles we had already
for x in articles:
if not x.record(self.connection): return False
if not x.record(self.connection, self): return False # TODO: don't pass the whole damn spider here
return True

def calculate_vectors(self):
@@ -328,7 +362,10 @@ def full_run(spider, collection="bioinformatics"):
full_run(spider)
elif sys.argv[1] == "rankings":
spider.rank_articles()
elif sys.argv[1] == "tsvectors":
spider.calculate_vectors()
elif sys.argv[1] == "traffic":
if len(sys.argv) > 2:
spider.refresh_article_stats(sys.argv[2])
else:
print("Must specify collection to refresh traffic stats for.")
else:
full_run(spider, sys.argv[1])
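
With the new argument handling at the end, refreshing traffic stats now requires naming a collection on the command line. A likely invocation (the collection name here is just an example) would be something like:

    python spider.py traffic bioinformatics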
