In [1]:
#@title Scrape paper information
%%capture
!pip -q install pymed==0.8.9
!pip -q install paperscraper==0.2.10
!pip -q install arxivscraper
!pip -q install rich

import arxivscraper, textwrap, json, torch
import numpy as np
import pandas as pd
from datetime import date, datetime, timedelta
from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
from pymed import PubMed
from huggingface_hub import notebook_login
notebook_login()
import warnings
warnings.filterwarnings("ignore")

# date handling
def format_date(date, sep):
  """Render ``date`` as zero-padded year, month and day joined by ``sep``.

  ``date`` is any object exposing ``strftime``. ``sep`` must be exactly one
  character long; this is enforced with an assertion.
  """
  assert len(sep) == 1
  # Each field is formatted on its own so that ``sep`` is never interpreted
  # as part of a strftime format string.
  year = date.strftime("%Y")
  month = date.strftime("%m")
  day = date.strftime("%d")
  return sep.join((year, month, day))

def format_dates(dates, sep):
  """Apply ``format_date`` to every item of ``dates`` and return the strings as a list."""
  formatted = []
  for day in dates:
    formatted.append(format_date(day, sep))
  return formatted

def dates_between(start_date, end_date):
  """
  Returns a list of all dates between two dates (inclusive).

  The list starts at ``start_date`` and advances one day at a time while the
  value does not exceed ``end_date``. An empty list is returned when
  ``start_date`` is later than ``end_date``.

  The number of steps is computed up front instead of incrementing until the
  bound is passed, so a range ending on the largest representable date no
  longer raises ``OverflowError``.
  """
  whole_days = (end_date - start_date).days
  # ``range`` is empty for a negative span, which covers start_date > end_date.
  return [start_date + timedelta(days=offset) for offset in range(whole_days + 1)]

# Get the last date when the bibliography was assessed. The JSON file holds
# 'year', 'month' and 'day' as strings.
with open("/content/drive/MyDrive/WIP/Biblio/biblio_date.json", "r") as f:
  date_dict = json.load(f)
date_start_str = "/".join([date_dict['year'], date_dict['month'], date_dict['day']])
date_start_datetime = datetime.strptime(date_start_str, "%Y/%m/%d")

# The window ends yesterday. Compute it once so the rXiv and PubMed windows
# cannot disagree if the cell happens to run across midnight.
date_end = date.today() - timedelta(days=1)

#rXiv dates (YYYY-MM-DD strings)
start_rxivs = format_date(date_start_datetime, "-")
end_rxivs = format_date(date_end, "-")

#pubmed dates (one YYYY/MM/DD string per day; empty when the start is after the end)
dates = dates_between(date_start_datetime.date(), date_end)
pubmed_days = format_dates(dates, "/")





In [2]:
#@title Download paper dumps
#Download paper list from rXivs; each call writes a JSONL dump for the
#[start_rxivs, end_rxivs] window to the given path.
print("medRxiv:")
medrxiv(begin_date=start_rxivs, end_date=end_rxivs, save_path="medrxiv.jsonl")
print("bioRxiv:")
biorxiv(begin_date=start_rxivs, end_date=end_rxivs, save_path="biorxiv.jsonl")
print("chemRxiv:")
chemrxiv(begin_date=start_rxivs, end_date=end_rxivs, save_path="chemrxiv.jsonl")

#scrape the arxiv q-bio papers
print("arXiv:")
scraper = arxivscraper.Scraper(category='q-bio', date_from=start_rxivs, date_until=end_rxivs)
output = scraper.scrape()
# NOTE(review): arxivscraper appears to return a non-list sentinel instead of
# an empty list when the window contains no records -- confirm against the
# installed version. Normalise to an empty list so the DataFrame constructor
# yields an empty frame with the expected columns rather than raising.
if not isinstance(output, list):
  output = []
cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')
df_arxiv = pd.DataFrame(output, columns=cols)

medRxiv:


107it [00:17,  6.29it/s]


bioRxiv:


382it [01:07,  5.70it/s]


chemRxiv:


58it [00:43,  1.35it/s]
100%|██████████| 60/60 [00:00<00:00, 4455.23it/s]


arXiv:
fetching up to  1000 records...
fetching is completed in 2.5 seconds.
Total number of records 54


In [6]:
#@title Load classifier and tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Label mapping handed to the classification model; label2id is derived from
# id2label so the two can never drift apart.
id2label = {0: "Not Relevant", 1: "Potentially Interesting"}
label2id = {label: idx for idx, label in id2label.items()}

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not crash on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("batroelens/PubMed_interests")
model = AutoModelForSequenceClassification.from_pretrained("batroelens/PubMed_interests", num_labels=2, id2label=id2label, label2id=label2id).to(device)

def preprocess_function(abstract):
    """Tokenize ``abstract`` for the classifier.

    The text is truncated to at most 512 tokens, returned as PyTorch tensors
    and moved to the same device the model was placed on.
    """
    return tokenizer(abstract, truncation=True, max_length=512, return_tensors='pt').to(device)

# Column-oriented accumulator for every scored preprint.
data = {"Title": [], "Abstract": [], "Relevance": [], "Journal": [], "URL": []}

# Score each record of the downloaded JSONL dumps (one JSON object per line).
for jsonfile in ["medrxiv.jsonl", "biorxiv.jsonl", "chemrxiv.jsonl"]:
  source_name = jsonfile.split(".")[0]
  with open(jsonfile) as infile:
    for line in infile:
      if not line.strip():
        # tolerate blank lines (e.g. a trailing newline) in the dump
        continue
      record = json.loads(line)
      abstract = record["abstract"].replace("\n", " ")
      ab = preprocess_function(abstract)
      # Inference only: skip autograd bookkeeping for the forward pass.
      with torch.no_grad():
        output = model(**ab)
      probabilities = torch.softmax(output.logits, dim=1)
      class_probabilities = probabilities[0].tolist()
      data["Title"].append(record["title"])
      data["Abstract"].append(abstract)
      # index 1 is the "Potentially Interesting" class
      data["Relevance"].append(class_probabilities[1])
      data["Journal"].append(source_name)
      data["URL"].append("https://doi.org/" + record["doi"])

# Column-oriented accumulator for scored PubMed articles, kept separate from
# the preprint data so it can be dropped as a whole if the PubMed step fails.
data_pm = {"Title": [], "Abstract": [], "Relevance": [], "Journal": [], "URL": []}
pm_fail = False

try:
  # One client is enough for all daily queries.
  pubmed = PubMed(tool="MyTool", email="bla@bla.bla")
  for pubmed_date in pubmed_days:
    search_query = f"{pubmed_date}[PDAT]"
    # NOTE(review): max_results=500 caps what is fetched per day; days with
    # more matches are presumably truncated -- confirm this is intended.
    results = pubmed.query(search_query, max_results=500)
    for article in results:
      if article.abstract is None:
        continue
      ab = preprocess_function(article.abstract)
      # Inference only: skip autograd bookkeeping for the forward pass.
      with torch.no_grad():
        output = model(**ab)
      probabilities = torch.softmax(output.logits, dim=1)
      class_probabilities = probabilities[0].tolist()

      # The journal may be absent or None on some results; fall back to "NA".
      journal = getattr(article, "journal", None)
      if isinstance(journal, str):
        journal = journal.strip().replace("\n", " ")
      else:
        journal = "NA"

      # A missing DOI used to raise here and abort the whole PubMed step.
      # Prefer the DOI, then a PubMed link, then "NA".
      # (assumes pymed results expose `pubmed_id` -- getattr keeps this safe if not)
      doi = getattr(article, "doi", None)
      pmid = getattr(article, "pubmed_id", None)
      if isinstance(doi, str) and doi.strip():
        url = "https://doi.org/" + doi.split("\n")[0]
      elif isinstance(pmid, str) and pmid.strip():
        url = "https://pubmed.ncbi.nlm.nih.gov/" + pmid.split("\n")[0].strip() + "/"
      else:
        url = "NA"

      # Append only once the whole row is known, so all columns keep the same length.
      data_pm["Title"].append(article.title)
      data_pm["Abstract"].append(article.abstract.replace("\n", " "))
      data_pm["Relevance"].append(class_probabilities[1])
      data_pm["Journal"].append(journal)
      data_pm["URL"].append(url)
except Exception as err:
  # Best effort: PubMed is optional, but report why it failed instead of
  # hiding the cause (and let KeyboardInterrupt through).
  pm_fail = True
  print(f"Pubmed analysis failed: {err!r}")

# Score the scraped arXiv records and add them to the preprint accumulator.
for _, row in df_arxiv.iterrows():
  abstract = row['abstract']
  ab = preprocess_function(abstract)
  # Inference only: skip autograd bookkeeping for the forward pass.
  with torch.no_grad():
    output = model(**ab)
  probabilities = torch.softmax(output.logits, dim=1)
  class_probabilities = probabilities[0].tolist()

  # Records without a DOI would otherwise produce the dead link
  # "https://doi.org/" (or fail on a missing value); link to the arXiv
  # abstract page instead.
  doi = row['doi']
  if isinstance(doi, str) and doi.strip():
    url = "https://doi.org/" + doi
  else:
    url = "https://arxiv.org/abs/" + str(row['id'])

  data["Title"].append(row['title'])
  data["Abstract"].append(abstract.replace("\n", " "))
  # index 1 is the "Potentially Interesting" class
  data["Relevance"].append(class_probabilities[1])
  data["Journal"].append('arXiv')
  data["URL"].append(url)

Pubmed analysis failed


In [7]:
#@title Display relevant papers
from rich.console import Console
from rich.markup import escape
console = Console()

# Minimum "Potentially Interesting" probability for a paper to be listed.
RELEVANCE_THRESHOLD = 0.8

# Merge into a new dict instead of mutating `data`, so re-running this cell
# does not append the PubMed rows a second time.
if pm_fail:
  all_papers = data
else:
  all_papers = {key: value + data_pm[key] for key, value in data.items()}

papers = pd.DataFrame.from_dict(all_papers)
papers = papers.sort_values("Relevance", ascending=False)
selected_papers = papers[papers["Relevance"] >= RELEVANCE_THRESHOLD]

for i, (_, row) in enumerate(selected_papers.iterrows(), start=1):
  prob = row["Relevance"]
  # Escape scraped text: square brackets in a title, journal or URL would
  # otherwise be parsed as Rich markup tags.
  title = escape(str(row["Title"]))
  journal = escape(str(row["Journal"]))
  url = escape(str(row["URL"]))
  console.print(f"{i}- [bold]{title}[/bold] - [italic]{prob}[/italic] \n [italic]{journal}[/italic] \n [cyan]{url}[/cyan]")

In [None]:
#@title Format for adding to TSV for subsequent days.
#@markdown Manually set the value at the start of each row to one to indicate a positive example.
# Emit one tab-separated line per selected paper: rounded relevance, title, abstract.
tsv_columns = zip(selected_papers["Relevance"], selected_papers["Title"], selected_papers["Abstract"])
for prob, title, abstract in tsv_columns:
  label = int(round(prob))
  print(f"{label} \t{title} \t{abstract}")

In [None]:
#@title update stored date in json and disconnect runtime
from datetime import date, datetime, timedelta

# Resume next time from the day after the last day that was actually scraped.
# `end_rxivs` is the YYYY-MM-DD end of this run's window. On a same-day run
# this equals today's date; if the notebook ran across midnight it avoids
# skipping a day.
next_start = datetime.strptime(end_rxivs, "%Y-%m-%d").date() + timedelta(days=1)
new_date = {'year': next_start.strftime("%Y"), 'month': next_start.strftime("%m"), 'day': next_start.strftime("%d")}

# NOTE(review): the date is advanced even when the PubMed step failed, so
# those PubMed days are not retried on the next run -- confirm this is intended.
json_date = '/content/drive/MyDrive/WIP/Biblio/biblio_date.json'
with open(json_date, "w") as f:
  json.dump(new_date, f, indent=4)

# Release the Colab runtime once the date has been saved.
from google.colab import runtime
runtime.unassign()