Uses a fine-tuned BERT network to classify biomechanics papers from PubMed

In [99]:
try:
  from official.nlp import optimization
except:
  !pip install -q -U tf-models-official
  from official.nlp import optimization
try:
  from Bio import Entrez
except:
  !pip install -q -U biopython
  from Bio import Entrez
try:
  import tensorflow_text as text
except:
  !pip install -q -U tensorflow_text
  import tensorflow_text as text

import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import tensorflow as tf
import string
import datetime
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import load_model
import tensorflow_hub as hub
from google.colab import drive
import datetime as dt
# Date window printed in the update header: today and the date one week earlier.
today = dt.date.today()
week_ago = today - dt.timedelta(weeks=1)

# Mount Google Drive at /content/gdrive; the model, label encoder and output
# paths used in later cells all live under '/content/gdrive/My Drive/'.
drive.mount('/content/gdrive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [107]:

# Define Search Criteria ----
def search(query, reldate=7, retmax='5000'):
    """Run a PubMed ESearch for recently published records matching ``query``.

    Parameters
    ----------
    query : str
        PubMed search term (same syntax as PubMed Advanced Search).
    reldate : int, optional
        Only records whose publication date (``datetype='pdat'``) falls within
        this many days of today are returned. Defaults to 7.
    retmax : str or int, optional
        Maximum number of records requested. Defaults to ``'5000'``.

    Returns
    -------
    The parsed ESearch response produced by ``Entrez.read``; callers read the
    matching PubMed IDs from its ``'IdList'`` entry.
    """
    Entrez.email = 'your.email@example.com'
    handle = Entrez.esearch(db='pubmed',
                            sort='most recent',
                            retmax=retmax,
                            retmode='xml',
                            datetype='pdat',
                            reldate=reldate,  # only within n days from now
                            # mindate='2019/03/25',
                            # maxdate='2019/03/27',  # for searching date range
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Release the network handle even when parsing fails.
        handle.close()


# search terms (can test string with Pubmed Advanced Search)
# search_results = search('(Biomech*[Title/Abstract] OR locomot*[Title/Abstract])')
# The NOT clauses exclude opioid, pharmacology-journal, mice and rat papers.
# The wildcard must be spelled "opioid*": a misspelled stem matches nothing useful.
search_results = search('(biomech*[Title/Abstract] OR locomot*[Title/Abstract] NOT opioid*[Title/Abstract] NOT pharm*[Journal] NOT mice[Title/Abstract] NOT rats[Title/Abstract])')

# Perform Search and Save Paper Titles ----
def fetch_details(ids):
    """Fetch the full PubMed records for the given IDs.

    Parameters
    ----------
    ids
        PubMed IDs to retrieve, passed unchanged to ``Entrez.efetch`` as ``id``.

    Returns
    -------
    The parsed EFetch response produced by ``Entrez.read``; callers read the
    article records from its ``'PubmedArticle'`` entry.
    """
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        return Entrez.read(handle)
    finally:
        # Release the network handle even when parsing fails.
        handle.close()


# Fetch the full record for every PubMed ID returned by the search.
id_list = search_results['IdList']
papers = fetch_details(id_list)
print("")

# Pre-allocate one slot per fetched article; the loops below fill each list
# by position, so index i always refers to the same paper in every list.
papers_length = len(papers['PubmedArticle'])
titles, full_titles, keywords, authors, links, journals, abstracts = (
    [None] * papers_length for _ in range(7)
)


def clean_str(text, stops):
    """Return the plain text of ``text`` with every token in ``stops`` removed.

    ``text`` is parsed with BeautifulSoup's 'lxml' parser to obtain its text
    content, split on whitespace, and the tokens not contained in ``stops``
    are joined back together with single spaces. Matching against ``stops``
    is exact, so it is case-sensitive.
    """
    plain = BeautifulSoup(text, 'lxml').text
    kept_tokens = (token for token in plain.split() if token not in stops)
    return ' '.join(kept_tokens)


# Make the Stop Words for string cleaning
# Start from NLTK's English stop words and add a capitalised copy of each,
# because clean_str compares tokens case-sensitively.
stop = list(stopwords.words('english'))
stop_c = [string.capwords(word) for word in stop]
stop.extend(stop_c)
stop.extend(['The', 'An', 'A', 'Do', 'Is', 'In'])
# Extra tokens to strip from PubMed text.
# NOTE(review): these look like Biopython element-repr residue and
# structured-abstract section headings - confirm.
new_stop = ['StringElement', 'NlmCategory', 'Label', 'attributes', 'INTRODUCTION',
            'METHODS', 'BACKGROUND', 'RESULTS', 'CONCLUSIONS']
stop.extend(new_stop)

# Collect the raw title, a cleaned title and a cleaned abstract for every article.
for i, paper in enumerate(papers['PubmedArticle']):
    article = paper['MedlineCitation']['Article']
    full_titles[i] = article['ArticleTitle']
    titles[i] = clean_str(full_titles[i], stop)
    try:
        # NOTE(review): only the first AbstractText element is used; abstracts
        # split into several elements lose everything after the first - confirm
        # this matches how the model was trained.
        abstracts[i] = clean_str(article['Abstract']['AbstractText'][0], stop)
    except (KeyError, IndexError):
        # No 'Abstract' entry, or an empty AbstractText list: store an empty abstract.
        abstracts[i] = ''
print(len(titles), 'Papers found')

# Pull information from PubMed Results ----
# Format title, journal, authors in markdown friendly manner

# Forum-style hyperlink: PubMed ID in the URL, plain-text title as the link text.
pubmed_link = '[URL="https://www.ncbi.nlm.nih.gov/pubmed/%s"]%s[/URL]'

for i, paper in enumerate(papers['PubmedArticle']):
    citation = paper['MedlineCitation']
    article = citation['Article']
    raw_title = article['ArticleTitle']
    if raw_title == '':
        # Papers without a title keep None in links/authors/journals/keywords.
        continue

    # Titles starting with '[' have their first and last characters dropped
    # before being used as link text.
    link_title = raw_title[1:-1] if raw_title[0] == '[' else raw_title
    links[i] = pubmed_link % (citation['PMID'], BeautifulSoup(link_title, 'lxml').text)

    # Authors are formatted as "LastName Initials," and joined with spaces.
    auths = []
    try:
        author_list = article['AuthorList']
    except KeyError:
        # Sentinel value; rows carrying it are filtered out before prediction.
        auths.append('AUTHOR NAMES ERROR')
        print(raw_title, 'has no author list?')
    else:
        for auth in author_list:
            try:
                auths.append(' '.join([auth['LastName'], auth['Initials'] + ',']))
            except (KeyError, TypeError):
                # Entry lacks a usable LastName/Initials pair: keep an empty placeholder.
                auths.append('')
                print(raw_title, 'has an issue with an author name')
    authors[i] = ' '.join(auths)
    journals[i] = '%s' % (article['Journal']['Title'])
    # store keywords (only the first keyword list is used)
    if citation['KeywordList'] != []:
        keywords[i] = ' '.join(kw[:] for kw in citation['KeywordList'][0])

# Clean up title and word strings ----
# Lower-case every title, then remove subscript/italic tags and the square
# brackets left behind by the html parser (same replacements, same order).
titles = [
    t.lower()
     .replace('<sub>', ' ').replace('</sub>', '')
     .replace('<i>', ' ').replace('</i>', '')
     .replace('[', '').replace(']', '')
    for t in titles
]
# clean up keywords: papers without keywords get an empty string, others are lower-cased
keywords2 = ['' if k is None else k.lower() for k in keywords]
keywords = keywords2
# keywords = [k.lower() for k in keywords] #same case


# Loading the Network ----
# Load Fine-Tuned BERT model
model = tf.saved_model.load('/content/gdrive/My Drive/Biomech_Lit_Up/literature_update/Data/BERT32/')
print('Loaded model from disk')

# Load Label Encoder: the saved class array is attached to a fresh encoder so
# that inverse_transform can map class indices back to topic names.
le = LabelEncoder()
le.classes_ = np.load('/content/gdrive/My Drive/Biomech_Lit_Up/literature_update/Data/BERT_label_encoder.npy')
print('Loaded Label Encoder')

# get titles for this week's literature update
papers_df = pd.DataFrame({'title': titles,
                          'keywords': keywords,
                          'abstract': abstracts,
                          'author': authors,
                          'journal': journals})

# Drop papers with an empty title or an unreadable author list.
# A boolean mask replaces dropping rows in place while iterating over them.
# The original index labels are kept on purpose: they are the positions of
# each paper in the authors/journals/links/full_titles lists.
unusable = (papers_df['title'] == '') | (papers_df['author'] == 'AUTHOR NAMES ERROR')
papers_df = papers_df[~unusable].copy()

# join titles and abstract
# NOTE(review): the two strings are concatenated without a separator, so the
# last title word runs into the first abstract word - confirm this matches
# the format the model was trained on before changing it.
papers_df['everything'] = papers_df['title'].astype(str) + papers_df['abstract'].astype(str)



80 Papers found
Biomechanical properties of a novel locking compression plate to stabilize oblique tibial osteotomies in buffaloes. has an issue with an author name
Loaded model from disk
Loaded Label Encoder


In [108]:
# Classify every remaining paper in a single call with training disabled.
# will run out of GPU memory if predicting more than ~2000 title+abstracts
title_and_abstract = papers_df['everything']
predicted_topic = model(title_and_abstract, training=False)



In [109]:
# Turn each prediction vector into a topic label plus a "top 2" score string.

# A prediction is trusted only when the best score exceeds this multiple of
# the runner-up score; otherwise the paper is filed under 'unknown'.
confidence_ratio = 1.5

topics = []
pred_val_vec = []
title_temp = []
indx = []

# predicted_topic holds one row per row of papers_df, in the same order, so
# the two are paired with zip. papers_df.index still carries the original
# paper positions (rows may have been dropped), which is what indx must store
# to look up authors/journals/links later. Pairing by enumerate() position
# would shift predictions onto the wrong papers after any dropped row.
for k, top_val in zip(papers_df.index, predicted_topic):
    # assumes each prediction is a 1-D vector of class scores - TODO confirm
    scores = np.asarray(top_val)
    order = np.argsort(scores)
    best = int(np.argmax(scores))
    # Highest-scoring class other than `best` (stays distinct even on ties).
    runner_up = int(order[-2] if order[-1] == best else order[-1])

    pred_val = scores[best]
    second_val = scores[runner_up]
    top1 = le.inverse_transform([best])[0]
    top2 = le.inverse_transform([runner_up])[0]

    indx.append(k)
    title_temp.append(papers_df.loc[k, 'title'])
    topics.append(top1 if pred_val > confidence_ratio * second_val else 'unknown')
    # report top 2 categories
    pred_val_vec.append(str(np.round(pred_val * 100, 1)) + '% ' + str(top1) + '; ' + str(
        np.round(second_val * 100, 1)) + '% ' + str(top2))

papers_df = pd.DataFrame(data={'title': title_temp,
                               'topic': topics,
                               'pred_val': pred_val_vec})


In [110]:
# Save Titles and Topics ----

# add info for github markdown format
# Strings wrapped in square brackets lose their first and last character.
# startswith() replaces the identity test against a string literal and is safe
# on empty strings; the author branch now slices the author string itself
# rather than the whole authors list.
papers_df['title'] = [title[1:-1] if title.startswith('[') else title
                      for title in papers_df['title']]
papers_df['authors'] = [authors[k][1:-1] if authors[k].startswith('[') else authors[k]
                        for k in indx]
# indx holds each paper's position in the per-paper lists built earlier.
papers_df['journal'] = [journals[k] for k in indx]
papers_df['links'] = [links[k] for k in indx]
papers_df['full_title'] = [full_titles[k] for k in indx]

# generate filename: <year>-<month>-<day>-litupdate[.ext] (numbers are not zero-padded)
now = datetime.datetime.now()
date_parts = [str(now.year), str(now.month), str(now.day)]
output_dir = '/content/gdrive/My Drive/Biomech_Lit_Up/literature_update/Literature_Updates/'
fname = output_dir + '-'.join(date_parts + ['litupdate.csv'])
mdname = output_dir + '-'.join(date_parts + ['litupdate.md'])
urlname = '-'.join(date_parts + ['litupdate'])

print('Filename: ', mdname)

Filename:  /content/gdrive/My Drive/Biomech_Lit_Up/literature_update/Literature_Updates/2020-12-17-litupdate.md


In [111]:
# Create Text File for Biomch-L ----
# Compile papers grouped by topic
txtname = '/content/gdrive/My Drive/Biomech_Lit_Up/literature_update/Literature_Updates/' + '-'.join([str(now.year), str(now.month), str(now.day), 'litupdate.txt'])

# Introductory text written once below the title and date range.
intro_text = """
Literature search terms: biomech* & locomot*

Publications are classified by [URL="https://www.github.com/alcantarar/literature_update"]BiomchBERT[/URL], a neural network trained on past Biomch-L Literature Updates. BiomchBERT is managed by [URL="https://www.twitter.com/Ryan_Alcantara_"]Ryan Alcantara[/URL], a PhD Candidate at the University of Colorado Boulder. Each publication has a score (out of 100%) reflecting how confident BiomchBERT is that the publication belongs in a particular category (top 2 shown). If something doesn't look right, email [EMAIL="biomchBERT@gmail.com"]biomchBERT@gmail.com[/EMAIL].

[URL="https://www.ryan-alcantara.com"]www.ryan-alcantara.com[/URL]. 

*********************NOTE*********************
- Not all articles have a DOI.
- Some DOI links may not yet be available online.
- Articles with no volume, issue or page numbers indicate that the article has not been published in paper form yet, but may be available in electronic form through the publisher


    """

# Topics whose printed heading differs from the classifier's label.
topic_headings = {
    'CARDIOVASCULAR/CARDIOPULMONARY': 'CARDIOVASCULAR/PULMONARY',
    'CELLULAR/SUBCELLULAR': 'CELLULAR',
    'ORTHOPAEDICS/SURGERY': 'ORTHOPAEDICS (SURGERY)',
    'ORTHOPAEDICS/SPINE': 'ORTHOPAEDICS (SPINE)',
}

# Write papers to text file grouped by topic ----
topic_list = np.unique(papers_df.sort_values('topic')['topic'])

# The context manager closes the file even if a write fails part-way through.
with open(txtname, 'w', encoding='utf-8') as txt:
    txt.write('LITERATURE UPDATE\n')
    txt.write(week_ago.strftime("%b %d") + ' - '+ today.strftime("%b %d, %Y")+'\n')
    txt.write(intro_text)

    for topic in topic_list:
        txt.write('\n')
        # Papers labelled '_REMOVE_' are left out of the update entirely.
        if topic == '_REMOVE_':
            continue
        # TOPIC NAME (with some cleaning)
        if topic == 'unknown':
            txt.write('[SIZE=16px][B]*Papers BiomchBERT is unsure how to classify*[/B][/SIZE]\n')
        else:
            txt.write('[SIZE=16px][B]*%s*[/B][/SIZE]\n' % topic_headings.get(topic, topic))

        papers_subset = papers_df[papers_df.topic == topic].reset_index(drop=True)
        # HYPERLINKED PAPERS, AUTHORS, JOURNAL NAME
        for _, row in papers_subset.iterrows():
            txt.write('[B]%s[/B] ' % row['links'])
            txt.write('%s ' % row['authors'])
            txt.write('[I]%s[/I]. ' % row['journal'])
            # CONFIDENCE SCORE (BERT softmax categorical crossentropy)
            try:
                # Numeric scores are printed as a percentage with one decimal.
                txt.write('(%.1f%%) \n\n' % row['pred_val'])
            except TypeError:
                # Non-numeric scores (e.g. a pre-formatted string) are written as-is.
                txt.write('(%s)\n\n' % row['pred_val'])

print('Literature Update Exported for Biomch-L')
print('Location:', txtname)

Literature Update Exported for Biomch-L
Location: /content/gdrive/My Drive/Biomech_Lit_Up/literature_update/Literature_Updates/2020-12-17-litupdate.txt


In [6]:
# # Create Markdown for ryan-alcantara.com ----
# # Compile papers grouped by topic
# md_file = open(mdname, 'w', encoding='utf-8')
# md_file.write('---\n')
# md_file.write('layout: single\n')
# md_file.write('title: Biomechanics Literature Update\n')
# md_file.write('collection: literature\n')
# md_file.write('permalink: /literature/%s\n' % urlname)
# md_file.write('excerpt: <br>\n')
# md_file.write('toc: true\n')
# md_file.write('toc_sticky: true\n')
# md_file.write('toc_label: Topics\n')
# md_file.write('---\n')

# # tidy up topic strings
# topic_list = np.unique(papers_df.sort_values('topic')['topic'])
# ss = [s for s in topic_list if 'UNIQUE' in s]
# for i, t in enumerate(topic_list):
#     if 'UNIQUE' in t:
#         topic_list[i] = 'UNIQUE TOPIC'
#         print('Assigned unique topic: ' + str(i))
#     if 'IMPACT' in t:
#         topic_list[i] = 'TRAUMA/IMPACT'


# # Make Markdown File ----
# st = '### Created by: [Ryan Alcantara](https://twitter.com/Ryan_Alcantara_)'
# st = st + ' & [Gary Bruening](https://twitter.com/garebearbru) -'
# st = st + ' University of Colorado Boulder\n\n'
# md_file.write(st)
# for topic in topic_list:
#     papers_subset = pd.DataFrame(papers_df[papers_df.topic == topic].reset_index(drop=True))
#     md_file.write('----\n')
#     if topic == 'unknown':
#         md_file.write('# %s: Num=%i\n' % (topic, len(papers_subset)))
#     else:
#         md_file.write('# %s\n' % topic)
#     md_file.write('----\n')
#     md_file.write('\n')
#     md_file.write('[Back to top](#created-by-ryan-alcantara--gary-bruening---university-of-colorado-boulder)')
#     md_file.write('\n')
#     for i, paper in enumerate(papers_subset['links']):
#         md_file.write('%s\n' % paper)
#         md_file.write('%s\n' % papers_subset['authors'][i])
#         md_file.write('%s.  \n' % papers_subset['journal'][i])
#         try:
#             md_file.write('(%.1f%%) \n' % papers_subset['pred_val'][i])
#         except:
#             md_file.write('%s\n' % papers_subset['pred_val'][i])
#         md_file.write('\n')

# md_file.close()
# print('Literature Update Exported as Markdown')
# print('Location:', mdname)

Literature Update Exported as Markdown
Location: /content/gdrive/My Drive/Biomech_Lit_Up/literature_update/Literature_Updates/2020-12-17-litupdate.md
