Let's find some wearables papers and merge them into the biomchBERT dataset...

In [16]:
# Install & load libraries

try:
  from Bio import Entrez
except:
  !pip install -q -U biopython
  from Bio import Entrez
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import string
from bs4 import BeautifulSoup

from google.colab import drive
import datetime as dt

# Mount Google Drive
drive.mount('/content/gdrive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [61]:
# read in pubmed search results
# search term: (sensor[Title/Abstract] OR wearable[Title/Abstract] NOT robot*[Title/Abstract]) AND (biomech*[Title/Abstract] OR locomot*[Title/Abstract])

# Load the CSV exported from the PubMed search described above.
pubmed_in = pd.read_csv('/content/gdrive/My Drive/literature_update/Data/csv-sensorTitl-set.csv')

# Hand Entrez a plain list of PMID strings instead of a pandas Series of
# integers: blank rows are dropped and the int cast guards against the column
# having been read as float (which would otherwise stringify as '12345.0').
id_list = pubmed_in['PMID'].dropna().astype(int).astype(str).tolist()

In [62]:

# Perform Search and Pull Paper Titles ----
def fetch_details(ids):
    """Fetch the PubMed records for ``ids`` and return them parsed.

    Parameters
    ----------
    ids
        PubMed identifiers, passed unchanged to ``Entrez.efetch`` as ``id``.

    Returns
    -------
    The object produced by ``Entrez.read`` for the XML response. The caller
    reads the individual records from its ``'PubmedArticle'`` entry.
    """
    # Placeholder address; replace with a real contact e-mail before running.
    Entrez.email = 'your.email@example.com'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        results = Entrez.read(handle)
    finally:
        # Always release the response handle, even if parsing raises.
        handle.close()
    return results


# Make the stop words for string cleaning ----
def clean_str(text, stops):
    """Strip markup from ``text`` and remove every word listed in ``stops``.

    Parameters
    ----------
    text : str
        Raw string, possibly containing HTML/XML tags.
    stops : iterable of str
        Words to drop. Matching is exact and case-sensitive.

    Returns
    -------
    str
        The remaining whitespace-separated words joined by single spaces.
    """
    # Build the lookup once: a set gives constant-time membership tests,
    # whereas a list is scanned again for every word.
    stop_set = set(stops)
    plain = BeautifulSoup(text, 'lxml').text
    return ' '.join(word for word in plain.split() if word not in stop_set)

# Start from NLTK's English stop words, then add a capitalised copy of each one.
stop = list(stopwords.words('english'))
stop_c = [string.capwords(word) for word in stop]
stop.extend(stop_c)

# Extra tokens to remove: a few title-case words plus parser/section-label
# residue that can appear in abstract text.
new_stop = ['The', 'An', 'A', 'Do', 'Is', 'In', 'StringElement', 
            'NlmCategory', 'Label', 'attributes', 'INTRODUCTION',
            'METHODS', 'BACKGROUND', 'RESULTS', 'CONCLUSIONS']
stop.extend(new_stop)

papers = fetch_details(id_list)
print(len(papers['PubmedArticle']), 'Papers found')

# One list per output column; each loop iteration appends exactly one entry to each.
titles, full_titles, keywords, authors, links, journals, abstracts = ([] for i in range(7))

for paper in papers['PubmedArticle']:
    citation = paper['MedlineCitation']
    article = citation['Article']

    # store titles and links
    t = article['ArticleTitle']
    titles.append(t)
    full_titles.append(t)
    pmid = citation['PMID']
    links.append('[URL="https://www.ncbi.nlm.nih.gov/pubmed/{0}"]{1}[/URL]'.format(pmid, t))

    # store the abstract ('' when the record has none)
    # NOTE(review): only the first AbstractText element is kept, so any further
    # elements in the list are dropped -- confirm this is intended.
    try:
        abstracts.append(article['Abstract']['AbstractText'][0])
    except (KeyError, IndexError):  # no 'Abstract' entry, or an empty AbstractText list
        abstracts.append('')

    # clean and store authors as "Last Initials," entries
    auths = []
    try:
        for auth in article['AuthorList']:
            if 'LastName' in auth and 'Initials' in auth:
                auths.append(auth['LastName'] + ' ' + auth['Initials'] + ',')
            elif 'LastName' in auth:  # maybe they don't have initials
                auths.append(auth['LastName'] + ',')
            else:  # no last name
                auths.append('')
                print(t, 'has an issue with an author name:')
        auths[-1] = auths[-1][0:-1]  # remove comma after last author
    except (KeyError, IndexError):  # missing AuthorList, or an empty one
        auths.append('AUTHOR NAMES ERROR')
        print(t, 'has no author list?')
    # compile authors into a single bracketed string
    authors.append("['" + ' '.join(auths).replace('[','').replace(']','') + "']")  # rm brackets in names

    # journal names
    journals.append(article['Journal']['Title'].replace('[','').replace(']',''))  # rm brackets

    # store keywords: first keyword list only, lower-cased and comma-separated
    keyword_lists = citation['KeywordList']
    if keyword_lists:
        keywords.append(', '.join(kw[:] for kw in keyword_lists[0]).lower())
    else:
        keywords.append('')

# Put Titles, Abstracts, Authors, Journal, and Keywords into dataframe
# mindate/maxdate are constant columns; the earlier date belongs in mindate
# (the two values were previously swapped).
papers_df = pd.DataFrame({'title': titles,
                          'keywords': keywords,
                          'abstract': abstracts,
                          'authors': authors,
                          'journal': journals,
                          'links': links,
                          'raw_title': full_titles,
                          'mindate': "2010/1/1",
                          'maxdate': "2020/12/12"})

# remove papers with no title or no authors
# The authors column stores the error marker wrapped as "['AUTHOR NAMES ERROR']",
# so an equality test against the bare marker never matches; search for it instead.
has_title = papers_df['title'] != ''
has_authors = ~papers_df['authors'].astype(str).str.contains('AUTHOR NAMES ERROR', regex=False)
papers_df = papers_df[has_title & has_authors].reset_index(drop=True)

# join titles and abstract
papers_df['BERT_input'] = papers_df['title'] + ' ' + papers_df['abstract']



1122 Papers found


In [63]:
# read in old biomch-L papers

## read in data ----
df = pd.read_csv('/content/gdrive/My Drive/literature_update/Data/Biomch-L_papers.csv', encoding='UTF-8-SIG')
df.columns = ['X','topic_split', 'topic', 'authors','title','journal','year','vol_issue','doi','abstract']

# keep just topics with MORE than 900 entries (strictly greater, as coded):
df = df.groupby('topic').filter(lambda group: len(group) > 900)
# remove unique topics
df = df[df['topic'] != 'UNIQUETOPIC']

# total number of topics
n_topics = df['topic'].nunique()

# blank out missing titles/abstracts, keep only topic/title/abstract,
# and restore the original row order
df = (
    df.assign(title=df['title'].fillna(''),
              abstract=df['abstract'].fillna(''))
      .drop(columns=['X', 'topic_split', 'authors', 'journal', 'year', 'vol_issue', 'doi'])
      .sort_index()
)

## clean up title and abstract text ----
#set cleaning parameters
def clean(t):
    """Return ``t`` with every stop word removed.

    ``t`` is split on whitespace and each token is compared, case-sensitively,
    against the module-level ``stop`` list as it exists when the function is
    called. The surviving tokens are joined with single spaces.
    """
    kept = []
    for token in t.split():
        if token not in stop:
            kept.append(token)
    return ' '.join(kept)

# NLTK's English stop words plus a handful of title-case variants.
# This rebinds `stop`, replacing any list built in an earlier cell.
stop = list(stopwords.words('english'))
stop.extend(['The', 'An', 'A', 'Do', 'Is', 'In'])

# parser/section-label residue that should also be stripped
new_stop = ['StringElement','NlmCategory','Label','attributes','INTRODUCTION',
            'METHODS','BACKGROUND','RESULTS','CONCLUSIONS']
stop.extend(new_stop)

# strip stop words from the Biomch-L titles and abstracts, then snapshot the frame
for text_col in ('title', 'abstract'):
    df[text_col] = df[text_col].apply(clean)
old_papers = df.copy()

# apply the same cleaning to a copy of the PubMed results
# (only 'title' and 'abstract' are cleaned; the other columns are copied as-is)
wearable_papers = papers_df.assign(
    title=papers_df['title'].apply(clean),
    abstract=papers_df['abstract'].apply(clean),
)

In [64]:
# show the column layout of each frame, PubMed results first
for frame in (wearable_papers, old_papers):
    print(frame.columns)

Index(['title', 'keywords', 'abstract', 'authors', 'journal', 'links',
       'raw_title', 'mindate', 'maxdate', 'BERT_input'],
      dtype='object')
Index(['topic', 'title', 'abstract'], dtype='object')


In [65]:
# Double check that none of these papers were included in past literature updates ----
old_papers = old_papers.dropna(subset=['title']).reset_index(drop=True)

# True where a cleaned PubMed title is exactly equal to a cleaned prior title
# NOTE(review): this is an exact string comparison, so differences in case or
# punctuation would prevent a match -- confirm that is acceptable.
match = wearable_papers['title'].isin(old_papers['title'])

# keep only the papers that were not already present
wearable_papers = wearable_papers[~match].reset_index(drop=True)

print(wearable_papers.shape)
wearable_papers.to_csv('/content/gdrive/My Drive/literature_update/Data/wearable_papers.csv', index=False)
# Manual follow-up (not done in this notebook): the top 700 rows were taken and
# filtered by hand; about 500 were kept.

(1119, 10)
