Uses a fine-tuned BERT network to classify biomechanics papers from PubMed.

In [1]:
# Replace the VM's /etc/localtime with a symlink to the America/Denver zone file,
# then print the current date/time to check that the change took effect.
# NOTE(review): presumably this is so the date arithmetic later in the notebook
# follows Mountain time rather than the VM default -- confirm.
!rm /etc/localtime
!ln -s /usr/share/zoneinfo/America/Denver /etc/localtime
!date
# might need to restart runtime if timezone didn't change

Thu Jan 14 12:18:08 MST 2021


In [2]:
## Install & load libraries
try:
  from official.nlp import optimization
except:
  !pip install -q -U tf-models-official
  from official.nlp import optimization
try:
  from Bio import Entrez
except:
  !pip install -q -U biopython
  from Bio import Entrez
try:
  import tensorflow_text as text
except:
  !pip install -q -U tensorflow_text
  import tensorflow_text as text

import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import tensorflow as tf
import string
import datetime
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import load_model
import tensorflow_hub as hub
from google.colab import drive
import datetime as dt
# Reference dates for this update; everything is anchored on "yesterday".
today = dt.date.today()
one_day = dt.timedelta(days=1)
yesterday = today - one_day
days_ago_6 = yesterday - 6 * one_day  # for text output
week_ago = yesterday - 7 * one_day    # ensure overlap in pubmed search
# Mount Google Drive
drive.mount('/content/gdrive')
print(today)

[K     |████████████████████████████████| 1.1MB 9.3MB/s 
[K     |████████████████████████████████| 51kB 8.1MB/s 
[K     |████████████████████████████████| 37.6MB 74kB/s 
[K     |████████████████████████████████| 358kB 52.7MB/s 
[K     |████████████████████████████████| 174kB 59.1MB/s 
[K     |████████████████████████████████| 102kB 14.7MB/s 
[K     |████████████████████████████████| 276kB 56.8MB/s 
[K     |████████████████████████████████| 1.2MB 54.6MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for pyyaml (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 2.3MB 7.0MB/s 
[K     |████████████████████████████████| 3.4MB 9.1MB/s 
[?25h[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Mounted at /content/gdrive
2021-01-14


In [3]:
# Define Search Criteria ----
def search(query, mindate=None, maxdate=None):
    """Run a PubMed ESearch for `query` and return the parsed result.

    Parameters
    ----------
    query : str
        PubMed search term string.
    mindate, maxdate : str, optional
        Bounds of the publication-date (`pdat`) range. When omitted, the
        module-level `min_date` / `max_date` variables are used, which is
        what the original zero-argument-date call relied on.

    Returns
    -------
    The structure produced by `Entrez.read` for the ESearch response
    (callers in this notebook read its 'IdList' entry).
    """
    Entrez.email = 'your.email@example.com'
    # Fall back to the notebook-level globals so existing `search(query)` calls keep working.
    if mindate is None:
        mindate = min_date
    if maxdate is None:
        maxdate = max_date
    handle = Entrez.esearch(db='pubmed',
                            sort='most recent',
                            retmax='5000',
                            retmode='xml',
                            datetype='pdat',
                            # reldate=7,  # only within n days from now
                            mindate=mindate,
                            maxdate=maxdate,  # for searching date range
                            term=query)
    try:
        results = Entrez.read(handle)
    finally:
        # Release the HTTP response even if parsing fails.
        handle.close()
    return results


# Perform Search and Pull Paper Titles ----
def fetch_details(ids):
    """Fetch the PubMed records for `ids` and return the parsed result.

    Parameters
    ----------
    ids : PubMed IDs to retrieve, passed straight through to `Entrez.efetch`
        as its `id` argument.

    Returns
    -------
    The structure produced by `Entrez.read` for the EFetch response
    (callers in this notebook read its 'PubmedArticle' entry).
    """
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        results = Entrez.read(handle)
    finally:
        # Release the HTTP response even if parsing fails.
        handle.close()
    return results


# Make the stop words for string cleaning ----
def html_strip(text):
    """Return the text content of `text` (parsed with lxml) with '[' and ']' removed."""
    plain = BeautifulSoup(text, 'lxml').text
    for bracket in ('[', ']'):
        plain = plain.replace(bracket, '')
    return plain

def clean_str(text, stops):
    """Strip markup from `text` and drop every whitespace-separated word found in `stops`.

    Parameters
    ----------
    text : markup/string to clean; parsed with BeautifulSoup's lxml parser.
    stops : collection of words to remove (exact, case-sensitive match).

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Membership tests against a list are linear per word; a frozenset makes them
    # constant-time. Fall back to the original container if its items are unhashable.
    try:
        stop_words = frozenset(stops)
    except TypeError:
        stop_words = stops
    words = BeautifulSoup(text, 'lxml').text.split()
    return ' '.join(word for word in words if word not in stop_words)

# NLTK English stop words, followed by the same words run through string.capwords.
stop = list(stopwords.words('english'))
stop_c = [string.capwords(lower_word) for lower_word in stop]
stop.extend(stop_c)

# Extra tokens to strip: a few capitalised words plus field/section labels.
new_stop = ['The', 'An', 'A', 'Do', 'Is', 'In', 'StringElement',
            'NlmCategory', 'Label', 'attributes', 'INTRODUCTION',
            'METHODS', 'BACKGROUND', 'RESULTS', 'CONCLUSIONS']
stop.extend(new_stop)

# Search terms (can test string with Pubmed Advanced Search) ----
# search_results = search('(Biomech*[Title/Abstract] OR locomot*[Title/Abstract])')
# NOTE(review): NCBI documents YYYY/MM/DD for mindate/maxdate; MM/DD/YYYY is kept here
# because these strings are also stored in the 'mindate'/'maxdate' columns -- confirm.
min_date = week_ago.strftime('%m/%d/%Y')
max_date = yesterday.strftime('%m/%d/%Y')
# Fixed the misspelled exclusion term ('opiod*' -> 'opioid*') so opioid papers are actually filtered out.
search_results = search('(biomech*[Title/Abstract] OR locomot*[Title/Abstract] NOT opioid*[Title/Abstract] NOT pharm*[Journal] NOT mice[Title/Abstract] NOT rats[Title/Abstract] NOT elegans[Title/Abstract])')
id_list = search_results['IdList']
papers = fetch_details(id_list)
print(len(papers['PubmedArticle']), 'Papers found')

# One list per output column; each loop iteration appends exactly one entry to every list.
titles, full_titles, keywords, authors, links, journals, abstracts = ([] for i in range(7))

for paper in papers['PubmedArticle']:
    citation = paper['MedlineCitation']
    article = citation['Article']
    raw_title = article['ArticleTitle']

    # clean and store titles, abstracts, and links
    t = clean_str(raw_title,
                  stop).replace('[','').replace(']','').capitalize()  # rm brackets that survived beautifulsoup, sentence case
    titles.append(t)
    full_titles.append(raw_title)
    pmid = citation['PMID']
    links.append('[URL="https://www.ncbi.nlm.nih.gov/pubmed/{0}"]{1}[/URL]'.format(pmid, html_strip(raw_title)))
    try:
        # Only the first AbstractText element is used.
        abstracts.append(clean_str(article['Abstract']['AbstractText'][0],
                                   stop).replace('[','').replace(']','').capitalize())  # rm brackets that survived beautifulsoup, sentence case
    except Exception:  # no usable abstract; `Exception` so Ctrl-C is not swallowed
        abstracts.append('')

    # clean and store authors
    auths = []
    try:
        for auth in article['AuthorList']:
            try:  # see if there is a last name and initials
                auth_name = [auth['LastName'], auth['Initials'] + ',']
                auth_name = ' '.join(auth_name)
                auths.append(auth_name)
            except Exception:
                if 'LastName' in auth.keys():  # maybe they don't have initials
                    auths.append(auth['LastName'] + ',')
                else:  # no last name
                    auths.append('')
                    print(raw_title,
                          'has an issue with an author name:')

    except Exception:
        # Sentinel value; rows carrying it are dropped when the dataframe is filtered.
        auths.append('AUTHOR NAMES ERROR')
        print(raw_title, 'has no author list?')
    # compile authors
    authors.append(' '.join(auths).replace('[','').replace(']',''))  # rm brackets in names
    # journal names
    journals.append(article['Journal']['Title'].replace('[','').replace(']',''))  # rm brackets

    # store keywords (first keyword list only)
    if citation['KeywordList'] != []:
        kwds = [kw[:] for kw in citation['KeywordList'][0]]
        keywords.append(', '.join(kwds).lower())
    else:
        keywords.append('')

# Put Titles, Abstracts, Authors, Journal, and Keywords into dataframe
# (min_date / max_date are scalars and are repeated on every row).
papers_df = pd.DataFrame({'title': titles,
                          'keywords': keywords,
                          'abstract': abstracts,
                          'authors': authors,
                          'journal': journals,
                          'links': links,
                          'raw_title': full_titles,
                          'mindate': min_date,
                          'maxdate': max_date})

# remove papers with no title or no authors
# (boolean mask instead of dropping rows one at a time while iterating)
has_title = papers_df['title'] != ''
has_authors = papers_df['authors'] != 'AUTHOR NAMES ERROR'
papers_df = papers_df[has_title & has_authors].reset_index(drop=True)

# join titles and abstract
papers_df['BERT_input'] = papers_df['title'] + ' ' + papers_df['abstract']

# Load Fine-Tuned BERT Network ----
model_dir = '/content/gdrive/My Drive/BiomchBERT/Data/BiomchBERT/'
model = tf.saved_model.load(model_dir)
print('Loaded model from disk')

# Load Label Encoder ----
# The encoder is rebuilt by assigning the saved class array to a fresh LabelEncoder.
label_classes_file = '/content/gdrive/My Drive/BiomchBERT/Data/BERT_label_encoder.npy'
le = LabelEncoder()
le.classes_ = np.load(label_classes_file)
print('Loaded Label Encoder')


109 Papers found
Loaded model from disk
Loaded Label Encoder


In [86]:
# Predict Paper Topic ----
# Calls the loaded model once on the whole 'BERT_input' column (title + abstract text).
# The next cell iterates over `predicted_topic` row by row; presumably each row holds
# one score per label-encoder class for one paper -- TODO confirm against the saved model.
predicted_topic = model(papers_df['BERT_input'], training=False)  # will run out of GPU memory (14GB) if predicting more than ~2000 title+abstracts at once

In [94]:
# Determine Publications that BiomchBERT is unsure about ----
# A prediction counts as confident only if the top score is more than this multiple
# of the second-best score.
CONFIDENCE_RATIO = 1.5

topics, pred_val_str = ([] for i in range(2))

for pred_prob in predicted_topic:
    probs = np.asarray(pred_prob)
    top1_idx = int(np.argmax(probs))
    # Find the runner-up by position rather than by looking its value up again:
    # a value lookup returns the top class a second time when the two scores tie.
    without_top = probs.astype(float)  # astype returns a copy, so `probs` is untouched
    without_top[top1_idx] = -np.inf
    top2_idx = int(np.argmax(without_top))

    pred_val = probs[top1_idx]
    second_val = probs[top2_idx]
    top1 = le.inverse_transform([top1_idx])[0]
    top2 = le.inverse_transform([top2_idx])[0]

    topics.append(top1 if pred_val > CONFIDENCE_RATIO * second_val else 'UNKNOWN')
    # Always report the top 2 categories, e.g. '48.8% ERGONOMICS; 42.7% REHABILITATION'
    pred_val_str.append(str(np.round(pred_val * 100, 1)) + '% ' + str(top1) + '; ' + str(
        np.round(second_val * 100, 1)) + '% ' + str(top2))

papers_df['topic'] = topics
papers_df['pred_val'] = pred_val_str

print('BiomchBERT is unsure about {0} papers\n'.format(len(papers_df[papers_df['topic'] == 'UNKNOWN'])))


BiomchBERT is unsure about 13 papers



In [95]:
# Prompt User to decide for BiomchBERT ----
# Writes go through papers_df.at[label, 'topic']: the previous
# papers_df.iloc[i]['topic'] = ... form is chained assignment and may only
# modify a temporary copy of the row.
unknown_papers = papers_df[papers_df['topic'] == 'UNKNOWN']
for indx, paper in unknown_papers.iterrows():
  print(paper['raw_title'])
  print(paper['journal'])
  print(paper['pred_val'])
  print()
  # pred_val looks like '48.8% ERGONOMICS; 42.7% REHABILITATION'; take the class name
  # after each percentage instead of substring-matching every known class, which could
  # return extra or misordered options if one class name is contained in another.
  splt_str = paper['pred_val'].split(';')
  options = [pred_cls.split('% ', 1)[1] for pred_cls in splt_str]

  choice = input('(1)st topic, (2)nd topic, (o)ther topic, or (r)emove paper? ').strip().lower()
  print()
  if choice == '1':
    papers_df.at[indx, 'topic'] = options[0]
  elif choice == '2':
    papers_df.at[indx, 'topic'] = options[1]
  elif choice == 'o':
    # print all categories so you can select
    for numbered_class in enumerate(le.classes_):
      print(numbered_class)
    new_cat = input('Enter number of new class or type "r" to remove paper: ').strip()
    print()
    if new_cat == 'r':
      papers_df.at[indx, 'topic'] = '_REMOVE_'  # not deleted, but withheld from text file output
    else:
      try:
        papers_df.at[indx, 'topic'] = le.classes_[int(new_cat)]
      except (ValueError, IndexError):
        # Bad entry: keep going rather than losing the answers already given.
        print('"{0}" is not a valid class number; paper left as UNKNOWN\n'.format(new_cat))
  elif choice == 'r':
    papers_df.at[indx, 'topic'] = '_REMOVE_'  # not deleted, but withheld from text file output
  else:
    print('Unrecognized choice "{0}"; paper left as UNKNOWN\n'.format(choice))

print('Removing {0} papers\n'.format(len(papers_df[papers_df['topic'] == '_REMOVE_'])))


Impact of Activity-Based Therapy on Respiratory Outcomes in a Medically Complex Child.
Children (Basel, Switzerland)
48.8% ERGONOMICS; 42.7% REHABILITATION

(1)st topic, (2)nd topic, (o)ther topic, or (r)emove paper? 2

Biogeography a key influence on distal forelimb variation in horses through the Cenozoic.
Proceedings. Biological sciences
44.8% COMPARATIVE; 36.3% EVOLUTION/ANTHROPOLOGY

(1)st topic, (2)nd topic, (o)ther topic, or (r)emove paper? 2

A portable pen-sized instrumentation to measure stiffness of soft tissues in vivo.
Scientific reports
54.3% METHODS; 43.3% TISSUE/BIOMATERIAL

(1)st topic, (2)nd topic, (o)ther topic, or (r)emove paper? 2

Prevalence of locomotive syndrome in Japanese patients more than 10 years after total hip arthroplasty: A cross-sectional cohort study.
Journal of orthopaedic science : official journal of the Japanese Orthopaedic Association
27.2% ORTHOPAEDICS/SURGERY; 25.1% ERGONOMICS

(1)st topic, (2)nd topic, (o)ther topic, or (r)emove paper? 1

The 

In [97]:
# Double check that none of these papers were included in past literature updates ----
prior_papers_csv = '/content/gdrive/My Drive/BiomchBERT/Updates/prior_papers.csv'

# load prior papers (rows without a title are discarded)
# papers_df.to_csv('/content/gdrive/My Drive/BiomchBERT/Updates/prior_papers.csv', index=False)  # run ONLY if there are no prior papers
prior_papers = (
    pd.read_csv(prior_papers_csv)
    .dropna(subset=['title'])
    .reset_index(drop=True)
)

# find matching titles between current week and prior papers
match = papers_df['title'].isin(prior_papers['title'])  # boolean

# keep only unseen papers, and stop if nothing is left
filtered_papers_df = papers_df.loc[~match]
if len(filtered_papers_df) == 0:
    raise ValueError('might have removed all the papers for some reason. ')

# Continue with the unseen papers and append them to the running history on disk.
filtered_papers_df = papers_df = filtered_papers_df.reset_index(drop=True)
updated_prior_papers = pd.concat([prior_papers, papers_df], axis=0, ignore_index=True)
updated_prior_papers.to_csv(prior_papers_csv, index=False)

In [98]:
# Create Text File for Biomch-L ----
# Compile papers grouped by topic
txtname = '/content/gdrive/My Drive/BiomchBERT/Updates/' + today.strftime("%Y-%m-%d") + '-litupdate.txt'

# Section headings for topics that are not printed under their own class name.
topic_headings = {
    'UNKNOWN': 'Papers BiomchBERT is unsure how to classify',
    'CARDIOVASCULAR/CARDIOPULMONARY': 'CARDIOVASCULAR/PULMONARY',
    'CELLULAR/SUBCELLULAR': 'CELLULAR',
    'ORTHOPAEDICS/SURGERY': 'ORTHOPAEDICS (SURGERY)',
    'ORTHOPAEDICS/SPINE': 'ORTHOPAEDICS (SPINE)',
}

# `with` guarantees the file is closed even if a write fails part-way through.
with open(txtname, 'w', encoding='utf-8') as txt:
    txt.write('[SIZE=16px][B]LITERATURE UPDATE[/B][/SIZE]\n')
    txt.write(days_ago_6.strftime("%b %d, %Y") + ' - '+ yesterday.strftime("%b %d, %Y")+'\n')  # a week ago from yesterday.
    txt.write(
    """
Literature search terms: biomech* & locomot*

Publications are classified by [URL="https://www.ryan-alcantara.com/projects/p88_BiomchBERT/"]BiomchBERT[/URL], a neural network trained on past Biomch-L Literature Updates. BiomchBERT is managed by [URL="https://www.ryan-alcantara.com"]Ryan Alcantara[/URL], a PhD Candidate at the University of Colorado Boulder. Each publication has a score (out of 100%) reflecting how confident BiomchBERT is that the publication belongs in a particular category (top 2 shown). If something doesn't look right, email ryan.alcantara[at]colorado.edu.

Twitter: [URL="https://www.twitter.com/Ryan_Alcantara_"]@Ryan_Alcantara_[/URL]. 


    """
    )

    # Write papers to text file grouped by topic ----
    topic_list = np.unique(papers_df['topic'])  # np.unique returns the topics sorted

    for topic in topic_list:
        # Papers marked for removal are withheld entirely; skipping before any write
        # also avoids the stray blank line that used to be emitted for this topic.
        if topic == '_REMOVE_':
            continue
        papers_subset = papers_df[papers_df.topic == topic].reset_index(drop=True)
        txt.write('\n')
        # TOPIC NAME (with some cleaning)
        txt.write('[SIZE=16px][B]*%s*[/B][/SIZE]\n' % topic_headings.get(topic, topic))
        # HYPERLINKED PAPERS, AUTHORS, JOURNAL NAME
        for link, author_names, journal, pred_val in zip(papers_subset['links'],
                                                         papers_subset['authors'],
                                                         papers_subset['journal'],
                                                         papers_subset['pred_val']):
            txt.write('[B]%s[/B] ' % link)
            txt.write('%s ' % author_names)
            txt.write('[I]%s[/I]. ' % journal)
            # CONFIDENCE SCORE (BERT softmax categorical crossentropy)
            try:
                txt.write('(%.1f%%) \n\n' % pred_val)  # numeric score
            except TypeError:
                # pred_val is the pre-formatted "top 2" string
                txt.write('(%s)\n\n' % pred_val)

    txt.write('[SIZE=16px][B]*PICK OF THE WEEK*[/B][/SIZE]\n')

print('Literature Update Exported for Biomch-L')
print('Location:', txtname)

Literature Update Exported for Biomch-L
Location: /content/gdrive/My Drive/BiomchBERT/Updates/2021-01-14-litupdate.txt
