
Fetching articles independently of collections
rabdill committed Oct 29, 2018
1 parent 601c278 commit ebd2b9e
Showing 4 changed files with 115 additions and 32 deletions.
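Taken together, the four files split the crawl into two passes: new papers are first recorded from the sitewide listing with no collection attached (the column is left NULL and surfaced through a new "missing_category" stat), and a separate pass over each collection page backfills the column via the new get_id and record_category helpers. The snippet below is not part of the commit; it is a condensed sketch of the resulting crawl order using only names that appear in the diff, with logging, error handling, and the abstract/Crossref steps omitted. The function name crawl_in_two_passes is invented for illustration.

def crawl_in_two_passes(spider):
  # Pass 1: record new papers from the sitewide "recent" listing,
  # leaving their collection column NULL for now.
  if config.crawl["fetch_new"] is not False:
    spider.find_record_new_articles()

  # Pass 2: visit each collection listing and backfill the collection
  # for papers recorded without one, refreshing stats per category.
  for collection in spider.fetch_categories():
    spider.determine_collection(collection)
    if config.crawl["refresh_stats"] is not False:
      spider.refresh_article_stats(collection, config.refresh_category_cap)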
11 changes: 9 additions & 2 deletions endpoints.py
@@ -26,7 +26,7 @@ def get_categories(connection):
"""
results = []
categories = connection.read("SELECT DISTINCT collection FROM articles ORDER BY collection;")
categories = connection.read("SELECT DISTINCT collection FROM articles WHERE collection IS NOT NULL ORDER BY collection;")
for cat in categories:
if len(cat) > 0:
results.append(cat[0])
@@ -287,6 +287,12 @@ def site_stats(connection):
continue # something fishy with this entry
outdated[entry[0]] = entry[1]

resp = connection.read("SELECT COUNT(id) FROM articles WHERE collection IS NULL;")
if len(resp) != 1 or len(resp[0]) != 1:
no_category = 0
else:
no_category = resp[0][0]

resp = connection.read("""
SELECT COUNT(id)
FROM (
@@ -311,5 +317,6 @@ def site_stats(connection):
"missing_abstract": no_abstract,
"missing_date": no_posted,
"outdated_count": outdated,
"missing_authors": no_authors
"missing_authors": no_authors,
"missing_category": no_category
}
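With collections no longer assigned at insert time, the new "missing_category" counter reports how many rows are still waiting for a collection pass. The snippet below is not part of the commit; it is a sketch of how that backlog could be checked directly against the database, assuming a psycopg2-style connection (the DSN is a placeholder, not the project's real configuration).

import psycopg2  # assumed driver, matching the %s parameter style used in the repo

conn = psycopg2.connect("dbname=rxivist user=postgres")  # placeholder DSN
with conn.cursor() as cursor:
  # Same query endpoints.py now uses to populate "missing_category"
  cursor.execute("SELECT COUNT(id) FROM articles WHERE collection IS NULL;")
  print("papers still awaiting a collection:", cursor.fetchone()[0])
conn.close()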
9 changes: 9 additions & 0 deletions models.py
@@ -398,6 +398,9 @@ def __init__(self, sql_entry, connection):
self.doi = sql_entry[7]
self.get_authors(connection)

if self.collection is None:
self.collection = "unknown"

def json(self):
return {
"id": self.id,
@@ -461,6 +464,9 @@ def __init__(self, article_id, connection):
self.publication = sql_entry[6]
self.pub_doi = sql_entry[7]

if self.collection is None:
self.collection = "unknown"

for author in self.authors:
author.GetBasicInfo(connection)

@@ -514,6 +520,9 @@ def __init__(self, article_id, connection):
self.doi = sql_entry[4]
self.ranks = ArticleRanks(self.id, connection)

if self.collection is None:
self.collection = "unknown"

def json(self):
return {
"id": self.id,
29 changes: 25 additions & 4 deletions spider/models.py
@@ -75,11 +75,11 @@ class Article:
def __init__(self):
pass

def process_results_entry(self, html, collection, log):
def process_results_entry(self, html, log):
self._find_title(html)
self._find_url(html)
self._find_doi(html, log)
self.collection = collection
self.collection = None
# NOTE: We don't get abstracts from search result pages
# because they're loaded asynchronously and it would be
# annoying to load every one separately.
@@ -124,7 +124,7 @@ def record(self, connection, spider): # TODO: requiring the whole spider here is
return False
else:
# If it's a revision
cursor.execute("UPDATE articles SET url=%s, title=%s, collection=%s WHERE doi=%s RETURNING id;", (self.url, self.title, self.collection, self.doi))
cursor.execute("UPDATE articles SET url=%s, title=%s WHERE doi=%s RETURNING id;", (self.url, self.title, self.doi))
self.id = cursor.fetchone()[0]
stat_table, authors = spider.get_article_stats(self.url)
spider._record_authors(self.id, authors, True)
@@ -136,7 +136,7 @@ def record(self, connection, spider): # TODO: requiring the whole spider here is
# If it's brand new:
with connection.db.cursor() as cursor:
try:
cursor.execute("INSERT INTO articles (url, title, doi, collection) VALUES (%s, %s, %s, %s) RETURNING id;", (self.url, self.title, self.doi, self.collection))
cursor.execute("INSERT INTO articles (url, title, doi) VALUES (%s, %s, %s) RETURNING id;", (self.url, self.title, self.doi))
except Exception as e:
spider.log.record(f"Couldn't record article '{self.title}': {e}", "error")
self.id = cursor.fetchone()[0]
@@ -164,3 +164,24 @@ def record(self, connection, spider): # TODO: requiring the whole spider here is
cursor.execute("UPDATE articles SET author_vector=to_tsvector(coalesce(%s,'')) WHERE id=%s;", (author_string, self.id))
spider.log.record(f"Recorded article {self.title}")
return True

def get_id(self, connection):
  with connection.db.cursor() as cursor:
    cursor.execute("SELECT id FROM articles WHERE doi=%s", (self.doi,))
    response = cursor.fetchone()
  if response is None or len(response) == 0:
    return False
  self.id = response[0]
  return True

def record_category(self, connection, log):
  # make sure this article has both an ID and a collection to record
  if self.collection is None or self.id is None:
    log.record(f"Paper {self.id} doesn't have a category, though it should. Exiting; something's wrong.", "fatal")
  with connection.db.cursor() as cursor:
    # check to see if we've already recorded a collection for this article
    cursor.execute("SELECT collection FROM articles WHERE id=%s", (self.id,))
    response = cursor.fetchone()
    if response is not None and len(response) > 0 and response[0] is not None:
      return False  # collection already recorded; nothing to update
    cursor.execute("UPDATE articles SET collection=%s WHERE id=%s;", (self.collection, self.id))
  log.record(f"Updated collection for article {self.id}: {self.collection}", "info")
  return True
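Together these helpers define the contract the collection crawl relies on: get_id resolves a paper found on a collection page back to the row created by the collection-agnostic pass, and record_category writes the collection only when the row does not have one yet, returning False when it was already set so the caller can count the paper as recognized. The snippet below is not part of the commit; it sketches the intended call pattern (compare determine_collection in spider/spider.py). The names articles_on_page, collection_name, connection, and log are placeholders, and the line assigning article.collection is an assumption, since the diff does not show where an article learns which collection page it was found on.

consecutive_recognized = 0
for article in articles_on_page:
  article.collection = collection_name  # assumed: tag the article with the page it came from
  if not article.get_id(connection):
    log.record(f"Encountered unknown paper in category listings: {article.doi}", "fatal")
  if not article.record_category(connection, log):
    consecutive_recognized += 1  # collection already recorded; treat as recognized
  else:
    consecutive_recognized = 0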
98 changes: 72 additions & 26 deletions spider/spider.py
@@ -55,12 +55,12 @@ def determine_page_count(html):
return int(pages[-1].text)
return 0

def pull_out_articles(html, collection, log):
def pull_out_articles(html, log):
entries = html.find(".highwire-article-citation")
articles = []
for entry in entries:
a = models.Article()
a.process_results_entry(entry, collection, log)
a.process_results_entry(entry, log)
articles.append(a)
return articles

@@ -83,6 +83,7 @@ def __init__(self):
self.log = Logger()

def _pull_crossref_data_date(self, datestring):
# Datestring should be format YYYY-MM-DD
self.log.record(f"Beginning retrieval of Crossref data for {datestring}", "info")
# (If we have multiple results for the same 24-hour period, the
# query that displays the most popular displays the same articles
@@ -134,23 +135,73 @@ def _pull_crossref_data_date(self, datestring):
cursor.executemany(sql, params)
self.log.record("Done with crossref.", "info")

def find_record_new_articles(self, collection):
def find_record_new_articles(self):
# we need to grab the first page to figure out how many pages there are
self.log.record(f"Fetching page 0")
try:
r = self.session.get(config.biorxiv["endpoints"]["recent"])
except Exception as e:
self.log.record(f"Error requesting first page of recent results. Retrying: {e}", "error")
try:
r = self.session.get(config.biorxiv["endpoints"]["recent"])
except Exception as e:
self.log.record(f"Error AGAIN requesting first page of results. Bailing: {e}", "error")
return

results = pull_out_articles(r.html, self.log)
consecutive_recognized = 0
for article in results:
if not article.record(self.connection, self):
consecutive_recognized += 1
if consecutive_recognized >= config.recognized_limit and config.stop_on_recognized: return
else:
consecutive_recognized = 0

for p in range(1, determine_page_count(r.html)): # iterate through each page of results
if config.polite:
time.sleep(3)
self.log.record(f"\n\nFetching page {p}") # pages are zero-indexed
try:
r = self.session.get("{}?page={}".format(config.biorxiv["endpoints"]["recent"], p))
except Exception as e:
self.log.record(f"Error requesting page {p} of results. Retrying: {e}", "error")
try:
r = self.session.get("{}?page={}".format(config.biorxiv["endpoints"]["recent"], p))
except Exception as e:
self.log.record(f"Error AGAIN requesting page of results: {e}", "error")
self.log.record("Crawling recent papers failed in the middle; unrecorded new articles are likely being skipped. Exiting to avoid losing them.", "fatal")
return

results = pull_out_articles(r.html, self.log)
for x in results:
if not x.record(self.connection, self):
consecutive_recognized += 1
if consecutive_recognized >= config.recognized_limit and config.stop_on_recognized: return
else:
consecutive_recognized = 0

def determine_collection(self, collection):
# we need to grab the first page to figure out how many pages there are
self.log.record(f"Fetching page 0 in {collection}")
try:
r = self.session.get(f'{config.biorxiv["endpoints"]["collection"]}/{collection}')
except Exception as e:
log.record(f"Error requesting first page of results for collection. Retrying: {e}", "error")
self.log.record(f"Error requesting first page of results for collection. Retrying: {e}", "error")
try:
r = self.session.get(f'{config.biorxiv["endpoints"]["collection"]}/{collection}')
except Exception as e:
log.record(f"Error AGAIN requesting first page of results for collection. Bailing: {e}", "error")
self.log.record(f"Error AGAIN requesting first page of results for collection. Bailing: {e}", "error")
return

results = pull_out_articles(r.html, collection, self.log)
results = pull_out_articles(r.html, self.log)
consecutive_recognized = 0
for article in results:
if not article.record(self.connection, self):
# make sure we know about the article already:
known = article.get_id(self.connection)
if not known:
self.log.record(f'Encountered unknown paper in category listings: {article.doi}', 'fatal')

if not article.record_category(self.connection, self.log):
consecutive_recognized += 1
if consecutive_recognized >= config.recognized_limit and config.stop_on_recognized: return
else:
@@ -171,9 +222,9 @@ def find_record_new_articles(self, collection):
log.record("Crawling of category {} failed in the middle; unrecorded new articles are likely being skipped. Exiting to avoid losing them.", "fatal")
return

results = pull_out_articles(r.html, collection, self.log)
results = pull_out_articles(r.html, self.log)
for x in results:
if not x.record(self.connection, self):
if not x.record_category(self.connection, self.log):
consecutive_recognized += 1
if consecutive_recognized >= config.recognized_limit and config.stop_on_recognized: return
else:
@@ -827,27 +878,24 @@ def load_rankings_from_file(batch, log):
if to_delete is not None:
os.remove(to_delete)

def full_run(spider, collection=None):
if collection is not None:
spider.find_record_new_articles(collection)
def full_run(spider):
if config.crawl["fetch_new"] is not False:
spider.find_record_new_articles()
else:
spider.log.record("No collection specified, iterating through all known categories.")
for collection in spider.fetch_categories():
spider.log.record(f"\n\nBeginning category {collection}", "info")
if config.crawl["fetch_new"] is not False:
spider.find_record_new_articles(collection)
else:
spider.log.record("Skipping search for new articles: disabled in configuration file.")

if config.crawl["refresh_stats"] is not False:
spider.refresh_article_stats(collection, config.refresh_category_cap)
else:
spider.log.record("Skipping refresh of paper download stats: disabled in configuration file.")
spider.log.record("Skipping search for new articles: disabled in configuration file.")
if config.crawl["fetch_abstracts"] is not False:
spider.fetch_abstracts()
else:
spider.log.record("Skipping step to fetch unknown abstracts: disabled in configuration file.")

for collection in spider.fetch_categories():
spider.log.record(f"\n\nBeginning category {collection}", "info")
spider.determine_collection(collection)
if config.crawl["refresh_stats"] is not False:
spider.refresh_article_stats(collection, config.refresh_category_cap)
else:
spider.log.record("Skipping refresh of paper download stats: disabled in configuration file.")

if config.crawl["fetch_crossref"] is not False:
spider.pull_todays_crossref_data()
else:
@@ -963,5 +1011,3 @@ def month_to_num(month):
print("Must submit ID number of article to be refreshed.")
exit(1)
spider.refresh_article_stats(id=sys.argv[2])
else:
full_run(spider, sys.argv[1])
