
Fetching articles independently of collections
rabdill committed Oct 29, 2018
1 parent 601c278 commit ebd2b9e
Showing 4 changed files with 115 additions and 32 deletions.
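Taken together, the four files split the crawl into two passes: new papers are first recorded from the sitewide listing with no collection attached (the column is left NULL and surfaced through a new "missing_category" stat), and a separate pass over each collection page backfills the column via the new get_id and record_category helpers. The snippet below is not part of the commit; it is a condensed sketch of the resulting crawl order using only names that appear in the diff, with logging, error handling, and the abstract/Crossref steps omitted. The function name crawl_in_two_passes is invented for illustration.

def crawl_in_two_passes(spider):
  # Pass 1: record new papers from the sitewide "recent" listing,
  # leaving their collection column NULL for now.
  if config.crawl["fetch_new"] is not False:
    spider.find_record_new_articles()

  # Pass 2: visit each collection listing and backfill the collection
  # for papers recorded without one, refreshing stats per category.
  for collection in spider.fetch_categories():
    spider.determine_collection(collection)
    if config.crawl["refresh_stats"] is not False:
      spider.refresh_article_stats(collection, config.refresh_category_cap)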
11 changes: 9 additions & 2 deletions endpoints.py
@@ -26,7 +26,7 @@ def get_categories(connection):
"""
results = []
categories = connection.read("SELECT DISTINCT collection FROM articles ORDER BY collection;")
categories = connection.read("SELECT DISTINCT collection FROM articles WHERE collection IS NOT NULL ORDER BY collection;")
for cat in categories:
if len(cat) > 0:
results.append(cat[0])
@@ -287,6 +287,12 @@ def site_stats(connection):
continue # something fishy with this entry
outdated[entry[0]] = entry[1]

resp = connection.read("SELECT COUNT(id) FROM articles WHERE collection IS NULL;")
if len(resp) != 1 or len(resp[0]) != 1:
no_category = 0
else:
no_category = resp[0][0]

resp = connection.read("""
SELECT COUNT(id)
FROM (
@@ -311,5 +317,6 @@ def site_stats(connection):
"missing_abstract": no_abstract,
"missing_date": no_posted,
"outdated_count": outdated,
"missing_authors": no_authors
"missing_authors": no_authors,
"missing_category": no_category
}
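With collections no longer assigned at insert time, the new "missing_category" counter reports how many rows are still waiting for a collection pass. The snippet below is not part of the commit; it is a sketch of how that backlog could be checked directly against the database, assuming a psycopg2-style connection (the DSN is a placeholder, not the project's real configuration).

import psycopg2  # assumed driver, matching the %s parameter style used in the repo

conn = psycopg2.connect("dbname=rxivist user=postgres")  # placeholder DSN
with conn.cursor() as cursor:
  # Same query endpoints.py now uses to populate "missing_category"
  cursor.execute("SELECT COUNT(id) FROM articles WHERE collection IS NULL;")
  print("papers still awaiting a collection:", cursor.fetchone()[0])
conn.close()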
9 changes: 9 additions & 0 deletions models.py
@@ -398,6 +398,9 @@ def __init__(self, sql_entry, connection):
self.doi = sql_entry[7]
self.get_authors(connection)

if self.collection is None:
self.collection = "unknown"

def json(self):
return {
"id": self.id,
@@ -461,6 +464,9 @@ def __init__(self, article_id, connection):
self.publication = sql_entry[6]
self.pub_doi = sql_entry[7]

if self.collection is None:
self.collection = "unknown"

for author in self.authors:
author.GetBasicInfo(connection)

@@ -514,6 +520,9 @@ def __init__(self, article_id, connection):
self.doi = sql_entry[4]
self.ranks = ArticleRanks(self.id, connection)

if self.collection is None:
self.collection = "unknown"

def json(self):
return {
"id": self.id,
29 changes: 25 additions & 4 deletions spider/models.py
@@ -75,11 +75,11 @@ class Article:
def __init__(self):
pass

def process_results_entry(self, html, collection, log):
def process_results_entry(self, html, log):
self._find_title(html)
self._find_url(html)
self._find_doi(html, log)
self.collection = collection
self.collection = None
# NOTE: We don't get abstracts from search result pages
# because they're loaded asynchronously and it would be
# annoying to load every one separately.
@@ -124,7 +124,7 @@ def record(self, connection, spider): # TODO: requiring the whole spider here is
return False
else:
# If it's a revision
cursor.execute("UPDATE articles SET url=%s, title=%s, collection=%s WHERE doi=%s RETURNING id;", (self.url, self.title, self.collection, self.doi))
cursor.execute("UPDATE articles SET url=%s, title=%s WHERE doi=%s RETURNING id;", (self.url, self.title, self.doi))
self.id = cursor.fetchone()[0]
stat_table, authors = spider.get_article_stats(self.url)
spider._record_authors(self.id, authors, True)
@@ -136,7 +136,7 @@ def record(self, connection, spider): # TODO: requiring the whole spider here is
# If it's brand new:
with connection.db.cursor() as cursor:
try:
cursor.execute("INSERT INTO articles (url, title, doi, collection) VALUES (%s, %s, %s, %s) RETURNING id;", (self.url, self.title, self.doi, self.collection))
cursor.execute("INSERT INTO articles (url, title, doi) VALUES (%s, %s, %s) RETURNING id;", (self.url, self.title, self.doi))
except Exception as e:
spider.log.record(f"Couldn't record article '{self.title}': {e}", "error")
self.id = cursor.fetchone()[0]
@@ -164,3 +164,24 @@ def record(self, connection, spider): # TODO: requiring the whole spider here is
cursor.execute("UPDATE articles SET author_vector=to_tsvector(coalesce(%s,'')) WHERE id=%s;", (author_string, self.id))
spider.log.record(f"Recorded article {self.title}")
return True

def get_id(self, connection):
  with connection.db.cursor() as cursor:
    cursor.execute("SELECT id FROM articles WHERE doi=%s", (self.doi,))
    response = cursor.fetchone()
  if response is None or len(response) == 0:
    return False
  self.id = response[0]
  return True

def record_category(self, connection, log):
  # make sure this article has both an ID and a collection to record
  if self.collection is None or self.id is None:
    log.record(f"Paper {self.id} doesn't have a category, though it should. Exiting; something's wrong.", "fatal")
  with connection.db.cursor() as cursor:
    # check to see if we've already recorded a collection for this article
    cursor.execute("SELECT collection FROM articles WHERE id=%s", (self.id,))
    response = cursor.fetchone()
    if response is not None and len(response) > 0 and response[0] is not None:
      return False  # collection already recorded; nothing to update
    cursor.execute("UPDATE articles SET collection=%s WHERE id=%s;", (self.collection, self.id))
  log.record(f"Updated collection for article {self.id}: {self.collection}", "info")
  return True
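Together these helpers define the contract the collection crawl relies on: get_id resolves a paper found on a collection page back to the row created by the collection-agnostic pass, and record_category writes the collection only when the row does not have one yet, returning False when it was already set so the caller can count the paper as recognized. The snippet below is not part of the commit; it sketches the intended call pattern (compare determine_collection in spider/spider.py). The names articles_on_page, collection_name, connection, and log are placeholders, and the line assigning article.collection is an assumption, since the diff does not show where an article learns which collection page it was found on.

consecutive_recognized = 0
for article in articles_on_page:
  article.collection = collection_name  # assumed: tag the article with the page it came from
  if not article.get_id(connection):
    log.record(f"Encountered unknown paper in category listings: {article.doi}", "fatal")
  if not article.record_category(connection, log):
    consecutive_recognized += 1  # collection already recorded; treat as recognized
  else:
    consecutive_recognized = 0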
98 changes: 72 additions & 26 deletions spider/spider.py
@@ -55,12 +55,12 @@ def determine_page_count(html):
return int(pages[-1].text)
return 0

def pull_out_articles(html, collection, log):
def pull_out_articles(html, log):
entries = html.find(".highwire-article-citation")
articles = []
for entry in entries:
a = models.Article()
a.process_results_entry(entry, collection, log)
a.process_results_entry(entry, log)
articles.append(a)
return articles

@@ -83,6 +83,7 @@ def __init__(self):
self.log = Logger()

def _pull_crossref_data_date(self, datestring):
# Datestring should be format YYYY-MM-DD
self.log.record(f"Beginning retrieval of Crossref data for {datestring}", "info")
# (If we have multiple results for the same 24-hour period, the
# query that displays the most popular displays the same articles
@@ -134,23 +135,73 @@ def _pull_crossref_data_date(self, datestring):
cursor.executemany(sql, params)
self.log.record("Done with crossref.", "info")

def find_record_new_articles(self, collection):
def find_record_new_articles(self):
# we need to grab the first page to figure out how many pages there are
self.log.record(f"Fetching page 0")
try:
r = self.session.get(config.biorxiv["endpoints"]["recent"])
except Exception as e:
self.log.record(f"Error requesting first page of recent results. Retrying: {e}", "error")
try:
r = self.session.get(config.biorxiv["endpoints"]["recent"])
except Exception as e:
self.log.record(f"Error AGAIN requesting first page of results. Bailing: {e}", "error")
return

results = pull_out_articles(r.html, self.log)
consecutive_recognized = 0
for article in results:
if not article.record(self.connection, self):
consecutive_recognized += 1
if consecutive_recognized >= config.recognized_limit and config.stop_on_recognized: return
else:
consecutive_recognized = 0

for p in range(1, determine_page_count(r.html)): # iterate through each page of results
if config.polite:
time.sleep(3)
self.log.record(f"\n\nFetching page {p}") # pages are zero-indexed
try:
r = self.session.get("{}?page={}".format(config.biorxiv["endpoints"]["recent"], p))
except Exception as e:
self.log.record(f"Error requesting page {p} of results. Retrying: {e}", "error")
try:
r = self.session.get("{}?page={}".format(config.biorxiv["endpoints"]["recent"], p))
except Exception as e:
self.log.record(f"Error AGAIN requesting page of results: {e}", "error")
self.log.record("Crawling recent papers failed in the middle; unrecorded new articles are likely being skipped. Exiting to avoid losing them.", "fatal")
return

results = pull_out_articles(r.html, self.log)
for x in results:
if not x.record(self.connection, self):
consecutive_recognized += 1
if consecutive_recognized >= config.recognized_limit and config.stop_on_recognized: return
else:
consecutive_recognized = 0

def determine_collection(self, collection):
# we need to grab the first page to figure out how many pages there are
self.log.record(f"Fetching page 0 in {collection}")
try:
r = self.session.get(f'{config.biorxiv["endpoints"]["collection"]}/{collection}')
except Exception as e:
log.record(f"Error requesting first page of results for collection. Retrying: {e}", "error")
self.log.record(f"Error requesting first page of results for collection. Retrying: {e}", "error")
try:
r = self.session.get(f'{config.biorxiv["endpoints"]["collection"]}/{collection}')
except Exception as e:
log.record(f"Error AGAIN requesting first page of results for collection. Bailing: {e}", "error")
self.log.record(f"Error AGAIN requesting first page of results for collection. Bailing: {e}", "error")
return

results = pull_out_articles(r.html, collection, self.log)
results = pull_out_articles(r.html, self.log)
consecutive_recognized = 0
for article in results:
if not article.record(self.connection, self):
# make sure we know about the article already:
known = article.get_id(self.connection)
if not known:
self.log.record(f'Encountered unknown paper in category listings: {article.doi}', 'fatal')

if not article.record_category(self.connection, self.log):
consecutive_recognized += 1
if consecutive_recognized >= config.recognized_limit and config.stop_on_recognized: return
else:
@@ -171,9 +222,9 @@ def find_record_new_articles(self, collection):
log.record("Crawling of category {} failed in the middle; unrecorded new articles are likely being skipped. Exiting to avoid losing them.", "fatal")
return

results = pull_out_articles(r.html, collection, self.log)
results = pull_out_articles(r.html, self.log)
for x in results:
if not x.record(self.connection, self):
if not x.record_category(self.connection, self.log):
consecutive_recognized += 1
if consecutive_recognized >= config.recognized_limit and config.stop_on_recognized: return
else:
@@ -827,27 +878,24 @@ def load_rankings_from_file(batch, log):
if to_delete is not None:
os.remove(to_delete)

def full_run(spider, collection=None):
if collection is not None:
spider.find_record_new_articles(collection)
def full_run(spider):
if config.crawl["fetch_new"] is not False:
spider.find_record_new_articles()
else:
spider.log.record("No collection specified, iterating through all known categories.")
for collection in spider.fetch_categories():
spider.log.record(f"\n\nBeginning category {collection}", "info")
if config.crawl["fetch_new"] is not False:
spider.find_record_new_articles(collection)
else:
spider.log.record("Skipping search for new articles: disabled in configuration file.")

if config.crawl["refresh_stats"] is not False:
spider.refresh_article_stats(collection, config.refresh_category_cap)
else:
spider.log.record("Skipping refresh of paper download stats: disabled in configuration file.")
spider.log.record("Skipping search for new articles: disabled in configuration file.")
if config.crawl["fetch_abstracts"] is not False:
spider.fetch_abstracts()
else:
spider.log.record("Skipping step to fetch unknown abstracts: disabled in configuration file.")

for collection in spider.fetch_categories():
spider.log.record(f"\n\nBeginning category {collection}", "info")
spider.determine_collection(collection)
if config.crawl["refresh_stats"] is not False:
spider.refresh_article_stats(collection, config.refresh_category_cap)
else:
spider.log.record("Skipping refresh of paper download stats: disabled in configuration file.")

if config.crawl["fetch_crossref"] is not False:
spider.pull_todays_crossref_data()
else:
@@ -963,5 +1011,3 @@ def month_to_num(month):
print("Must submit ID number of article to be refreshed.")
exit(1)
spider.refresh_article_stats(id=sys.argv[2])
else:
full_run(spider, sys.argv[1])
