In [1]:
#@title Scrape paper information
%%capture
!pip -q install pymed==0.8.9
!pip -q install paperscraper==0.2.10
!pip -q install arxivscraper
!pip -q install rich

import arxivscraper, textwrap, json, torch
import numpy as np
import pandas as pd
from datetime import date, datetime, timedelta
from paperscraper.get_dumps import biorxiv, medrxiv, chemrxiv
from pymed import PubMed
from huggingface_hub import notebook_login
notebook_login()
import warnings
warnings.filterwarnings("ignore")

# date handling
def format_date(date, sep):
  """Render ``date`` as zero-padded year, month and day joined by ``sep``.

  ``sep`` must be exactly one character (checked with an assertion),
  e.g. "-" gives "2024-01-05" and "/" gives "2024/01/05".
  """
  assert len(sep) == 1
  year, month, day = (date.strftime(code) for code in ("%Y", "%m", "%d"))
  return sep.join((year, month, day))

def format_dates(dates, sep):
  """Apply ``format_date`` to every entry of ``dates`` and return the strings as a list."""
  formatted = []
  for day in dates:
    formatted.append(format_date(day, sep))
  return formatted

def dates_between(start_date, end_date):
  """
  Build the list of consecutive days from start_date to end_date, both included.

  The list is empty when start_date lies after end_date.
  """
  span_days = (end_date - start_date).days
  return [start_date + timedelta(days=offset) for offset in range(span_days + 1)]

# Read back the date stored in the JSON file; it is the first day of this run's window.
with open("/content/drive/MyDrive/WIP/Biblio/biblio_date.json", "r") as f:
  date_dict = json.load(f)
date_start_str = "/".join([date_dict['year'], date_dict['month'], date_dict['day']])
date_start_datetime = datetime.strptime(date_start_str, "%Y/%m/%d")

# Bounds for the rXiv/arXiv downloads, formatted with "-"; the window closes yesterday.
start_rxivs = format_date(date_start_datetime, "-")
end_rxivs = format_date(date.today() - timedelta(days=1), "-")

# PubMed is queried one day at a time, with each day formatted with "/".
dates = dates_between(date_start_datetime.date(), date.today() - timedelta(days=1))
pubmed_days = format_dates(dates, "/")





In [2]:
#@title Download paper dumps
# Fetch the preprint dumps of the three rXiv servers for the date window,
# announcing each server before its download starts.
rxiv_downloads = (
    ("medRxiv", medrxiv, "medrxiv.jsonl"),
    ("bioRxiv", biorxiv, "biorxiv.jsonl"),
    ("chemRxiv", chemrxiv, "chemrxiv.jsonl"),
)
for server_name, download, dump_path in rxiv_downloads:
  print(f"{server_name}:")
  download(begin_date=start_rxivs, end_date=end_rxivs, save_path=dump_path)

# Scrape the arXiv q-bio category over the same window and keep the selected columns.
print("arXiv:")
scraper = arxivscraper.Scraper(category='q-bio', date_from=start_rxivs, date_until=end_rxivs)
output = scraper.scrape()
cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')
df_arxiv = pd.DataFrame(output, columns=cols)

medRxiv:


22it [00:12,  1.79it/s]


bioRxiv:


121it [00:43,  2.77it/s]


chemRxiv:


29it [00:08,  3.24it/s]
100%|██████████| 31/31 [00:00<00:00, 4604.88it/s]


arXiv:
fetching up to  1000 records...
fetching is completed in 2.3 seconds.
Total number of records 17


In [3]:
#@title Load classifier and tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Class ids produced by the classifier, mapped to human-readable topic names.
id2label = {0: "Not Relevant", 1: "ML Protein Engineering", 2: "Meiosis", 3: "Chromatin", 4: "Microscopy", 5: "ML Code", 6: "Genome Engineering", 7: "Protein Engineering"}
# Reverse lookup derived from id2label so the two mappings cannot drift apart.
label2id = {label: idx for idx, label in id2label.items()}

# Use the GPU when the runtime has one; otherwise fall back to CPU instead of crashing on .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): the tokenizer comes from "PubMed_interests" while the model comes from
# "PubMed_interests2" — confirm that both repositories share the same vocabulary.
tokenizer = AutoTokenizer.from_pretrained("batroelens/PubMed_interests")
model = AutoModelForSequenceClassification.from_pretrained("batroelens/PubMed_interests2", num_labels=len(id2label), id2label=id2label, label2id=label2id).to(device)

def preprocess_function(abstract):
    """Tokenize one abstract for the classifier.

    The text is truncated to at most 512 tokens, returned as PyTorch tensors,
    and moved to the device the model lives on so both always agree.
    """
    return tokenizer(abstract, truncation=True, max_length=512, return_tensors='pt').to(model.device)

# Column-wise accumulator for the rXiv dumps (filled here) and the arXiv records (appended later in this cell).
data = {"Title": [], "Abstract": [], "Class": [], "Journal": [], "URL": []}

for jsonfile in ["medrxiv.jsonl", "biorxiv.jsonl", "chemrxiv.jsonl"]:
  with open(jsonfile) as infile:
    for line in infile:
      if not line.strip():
        continue  # tolerate blank lines instead of crashing in json.loads
      record = json.loads(line)
      abstract = record["abstract"].replace("\n", " ")
      ab = preprocess_function(abstract)
      # Inference only: no_grad avoids building an autograd graph for every paper.
      with torch.no_grad():
        output = model(**ab)
      predicted_class = output.logits.argmax().item()

      data["Title"].append(record["title"])
      data["Abstract"].append(abstract)
      data["Class"].append(predicted_class)
      # The dump file name without its extension doubles as the journal label.
      data["Journal"].append(jsonfile.split(".")[0])
      data["URL"].append("https://doi.org/" + record["doi"])

# PubMed rows are collected separately; the display cell only merges them when pm_fail stays False.
data_pm = {"Title": [], "Abstract": [], "Class": [], "Journal": [], "URL": []}
pm_fail = False

try:
  # One client serves every daily query.
  pubmed = PubMed(tool="MyTool", email="bla@bla.bla")
  for pubmed_date in pubmed_days:
    search_query = f"{pubmed_date}[PDAT]"
    # NOTE(review): at most 500 records are requested per day; days with more matches are presumably truncated.
    results = pubmed.query(search_query, max_results=500)
    for article in results:
      if article.abstract is None:
        continue
      ab = preprocess_function(article.abstract)
      # Inference only: no_grad avoids building an autograd graph for every paper.
      with torch.no_grad():
        output = model(**ab)
      predicted_class = output.logits.argmax().item()

      # Some records have no usable journal attribute; label them "NA".
      try:
        journal = article.journal.strip().replace("\n", " ")
      except AttributeError:
        journal = "NA"

      # A record without a DOI used to raise here and discard the whole PubMed run.
      # Fall back to the PubMed page (or "NA") so one such record no longer fails everything.
      doi = (getattr(article, "doi", None) or "").split("\n")[0].strip()
      if doi:
        url = "https://doi.org/" + doi
      else:
        # presumably pubmed_id can hold several newline-separated ids, like doi; keep the first
        pmid = (getattr(article, "pubmed_id", None) or "").split("\n")[0].strip()
        url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" if pmid else "NA"

      # Append every column together so the lists always stay the same length.
      data_pm["Title"].append(article.title)
      data_pm["Abstract"].append(article.abstract.replace("\n", " "))
      data_pm["Class"].append(predicted_class)
      data_pm["Journal"].append(journal)
      data_pm["URL"].append(url)
except Exception as err:
  # Best effort: PubMed is optional, but report why it failed instead of hiding the cause.
  pm_fail = True
  print("Pubmed analysis failed")
  print(f"  reason: {type(err).__name__}: {err}")

# Classify the scraped arXiv records and append them to the shared accumulator.
for index, row in df_arxiv.iterrows():
  abstract = row["abstract"]
  ab = preprocess_function(abstract)
  # Inference only: no_grad avoids building an autograd graph for every paper.
  with torch.no_grad():
    output = model(**ab)
  predicted_class = output.logits.argmax().item()

  # An empty DOI would produce the dead link "https://doi.org/";
  # link to the arXiv abstract page in that case.
  doi = row["doi"]
  if isinstance(doi, str) and doi.strip():
    url = "https://doi.org/" + doi.strip()
  else:
    url = "https://arxiv.org/abs/" + str(row["id"])

  data["Title"].append(row["title"])
  data["Abstract"].append(abstract.replace("\n", " "))
  data["Class"].append(predicted_class)
  data["Journal"].append('arXiv')
  data["URL"].append(url)

tokenizer_config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/706k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [4]:
#@title Display relevant papers
from rich.console import Console
from rich.markup import escape
console = Console()

# Merge into a fresh dict instead of updating `data` in place, so re-running
# this cell does not append the PubMed rows a second time.
if pm_fail:
  all_data = data
else:
  all_data = {key: value + data_pm[key] for key, value in data.items()}

papers = pd.DataFrame.from_dict(all_data)
papers = papers.sort_values("Class", ascending=False)
selected_papers = papers[papers["Class"] >= 1]

# Class 0 is "Not Relevant"; every other class gets its own numbered section.
for j in range(1, len(id2label)):
  current_papers = selected_papers[selected_papers["Class"] == j]
  console.print(f"PAPERS on: {id2label[j]}")
  console.print("\n")
  for i, (_, row) in enumerate(current_papers.iterrows(), start=1):
    # Escape scraped text so square brackets in a title print literally
    # instead of being parsed as rich markup tags.
    title, journal, url = (escape(str(row[col])) for col in ("Title", "Journal", "URL"))
    console.print(f"{i}- [bold]{title}[/bold] \n [italic]{journal}[/italic] \n [cyan]{url}[/cyan]")
  console.print("\n")
  console.print("\n")

In [5]:
#@title Format for adding to TSV for subsequent days.
#@markdown Manually set the value at the start of each row to one to indicate a positive example.
# Emit one tab-separated line per selected paper: predicted class id, title, abstract.
tsv_columns = zip(selected_papers["Class"], selected_papers["Title"], selected_papers["Abstract"])
for label, title, abstract in tsv_columns:
  print(f"{int(round(label))} \t{title} \t{abstract}")

3 	Mapping the Human Proteome with Physical Access to DNA 	In a human cell, DNA is packed in histones, RNA, and chromatin-associated proteins, forming a cohesive gel. At any given moment, only a specific subset of the proteome has physical access to the DNA and organizes its structure, transcription, replication, repair and other molecular functions essential to the way the genome is read and maintained. We have developed a zero-distance photo-crosslinking approach to quantify proteins in direct contact with DNA in living cells. Collecting DNA interactomes from human breast cancer cells, we present an atlas of over one thousand proteins with physical access to DNA, and hundreds of peptide-nucleotide crosslinks pinpointing protein-DNA interfaces with single amino-acid resolution. Differential comparisons of DNA interactomes from cells undergoing treatment with estrogen or genotoxic chemotherapy recapitulated the recruitment of key transcription factors and DNA damage proteins. This open

In [6]:
#@title update stored date in json and disconnect runtime
from datetime import date, datetime, timedelta

# Store today's date as zero-padded strings under the keys 'year', 'month' and 'day'.
today = date.today()
new_date = {field: today.strftime(code) for field, code in (("year", "%Y"), ("month", "%m"), ("day", "%d"))}

# Overwrite the JSON file that the first cell reads to find its start date.
json_date = '/content/drive/MyDrive/WIP/Biblio/biblio_date.json'
with open(json_date, "w") as f:
  json.dump(new_date, f, indent=4)

# Release the Colab runtime now that the run is finished.
from google.colab import runtime
runtime.unassign()