In [None]:
from Bio import Entrez
from Bio import Medline
import pandas as pd
import time
import numpy as np
from ratelimiter import RateLimiter
import pubmed_parser as pp
import nltk
import matplotlib
# Contact address NCBI asks every E-utilities client to supply.
Entrez.email = 'amanda.sawyer@nih.gov'
# NOTE(review): this key is a credential stored in plain text in the notebook.
# Consider rotating it and loading it from an environment variable or getpass
# before sharing or committing the notebook.
api_key = '86d72be66a4381e2e22c704615cbb9620c08'
# Register the key with Biopython so requests actually carry it. Without this
# assignment the key is never sent, and the 10-calls-per-second limiters used
# below exceed the lower rate NCBI grants to requests made without a key.
Entrez.api_key = api_key

In [None]:
# CSV of proposed MeSH terms; later cells read its 'term' column.
proposedTermsPath = 'mesh-proposed-terms.csv'
# Table that Method 1 extends with the PubMed count columns.
proposedTerms = pd.read_csv(proposedTermsPath)

### 1. Title and Title/Abstract Posting from PubMed
**Method 1: Testing Efficiency by using apply() and RateLimiter**

In [None]:
#define functions for title and title/abstract
#RateLimiter limits to max of 10 calls per second using my NCBI key, per their requirements: https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/
#Find information on RateLimiter here: https://pypi.org/project/ratelimiter/
#Find information on Pandas apply() function here: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html

def _count_pubmed_hits(term, field_tag):
    """Return the PubMed hit count for ``term`` restricted to one search field.

    Parameters
    ----------
    term : str
        Search term, normally one cell of the proposed-terms 'term' column.
    field_tag : str
        PubMed field tag including its brackets, e.g. ``'[ti]'``.

    Returns
    -------
    int
        The 'Count' reported by esearch, or 0 when the term is unusable or
        the request fails (the failure is printed, not raised).
    """
    # Empty CSV cells reach us as NaN (a float). Building the query from one
    # would raise before the search string exists, so reject them up front.
    if not isinstance(term, str) or not term.strip():
        print("Error:", term)
        return 0

    searchstring = term + ' ' + field_tag
    try:
        handle = Entrez.esearch(db='pubmed', term=searchstring)
        try:
            result = Entrez.read(handle)
        finally:
            # Release the connection even when parsing fails.
            handle.close()
        # Convert so the column holds ints on success as well as on failure.
        return int(result['Count'])
    except Exception as err:
        # Best effort: report and carry on with the remaining terms.
        # KeyboardInterrupt is not an Exception, so the run can be stopped.
        print("Error:", searchstring, '-', err)
        return 0


@RateLimiter(max_calls=10, period=1)
def getPubMedTitleTotal(term):
    """Count PubMed records whose title matches ``term`` (field tag ``[ti]``).

    Limited to 10 calls per second. Returns an int; 0 on any failure.
    """
    return _count_pubmed_hits(term, '[ti]')


@RateLimiter(max_calls=10, period=1)
def getPubMedTIABTotal(term):
    """Count PubMed records whose title or abstract matches ``term`` (``[tiab]``).

    Limited to 10 calls per second. Returns an int; 0 on any failure.
    """
    return _count_pubmed_hits(term, '[tiab]')

In [None]:
%%time
#use apply(), and Python magic time function
proposedTerms['Title Count'] = proposedTerms['term'].apply(getPubMedTitleTotal)
proposedTerms['Title Abstract Count'] = proposedTerms['term'].apply(getPubMedTIABTotal)

In [None]:
# Show the terms table, including the 'Title Count' and 'Title Abstract Count' columns.
proposedTerms

**Method 2 - Original Method using iteration instead of apply.**

In [None]:
def getPubMedtotal(searchstring, delay=3):
    """Return the number of PubMed records matching ``searchstring``.

    Parameters
    ----------
    searchstring : str
        Complete PubMed query, including any field tag such as ``[ti]``.
    delay : float, optional
        Seconds to sleep before the request, as a crude rate limit.
        Defaults to 3.

    Returns
    -------
    int
        The 'Count' reported by esearch, or 0 if the request fails
        (the failure is printed, not raised).
    """
    thingtoreturn = 0

    time.sleep(delay)
    try:
        # Only the total is needed, so ask for no ids at all.
        handle = Entrez.esearch(db="pubmed", term=searchstring, retmax=0)
        try:
            result = Entrez.read(handle)
        finally:
            # Release the connection even when parsing fails.
            handle.close()
        # Convert so success and failure both yield an int.
        thingtoreturn = int(result["Count"])
    except Exception as err:
        # Best effort: report and continue with the next term.
        print("Error:", searchstring, "-", err)

    return thingtoreturn

In [None]:
# Fresh copy of the proposed-terms table for Method 2, read from the same CSV.
List_XLSX = pd.read_csv(proposedTermsPath, index_col=None)

In [None]:
%%time
List_XLSX['TitleSearchTotal']=0
List_XLSX['TIABSearchTotal']=0

for index,row in List_XLSX.iterrows():
    List_XLSX.loc[index,'TitleSearchTotal']=getPubMedtotal(row['term'] + "[ti]")
    if index and not index % 250:
        print(index,"records processed")
print("Done.")

 
for index,row in List_XLSX.iterrows():
    List_XLSX.loc[index,'TIABSearchTotal']=getPubMedtotal(row['term'] + "[tiab]")
    if index and not index % 100:
        print(index,"records processed")
print("Done.")

In [None]:
# Show the Method 2 table, including the 'TitleSearchTotal' and 'TIABSearchTotal' columns.
List_XLSX

### 2. PI and MN from PubMed

In [None]:
@RateLimiter(max_calls=10, period=1)
def getPMIDs(term, retmax=1000):
    """Return the PubMed ids of records whose title matches ``term``.

    Parameters
    ----------
    term : str
        Search term; the ``[ti]`` field tag is appended to it.
    retmax : int, optional
        Maximum number of ids requested from esearch. Defaults to 1000.

    Returns
    -------
    The 'IdList' entry of the parsed esearch result.

    Errors from the request or from parsing propagate to the caller.
    """
    handle = Entrez.esearch(db='pubmed', term=term+'[ti]', retmax=retmax)
    try:
        result = Entrez.read(handle)
    finally:
        # Release the connection even when parsing fails.
        handle.close()
    return result['IdList']

@RateLimiter(max_calls=10, period=1)
def returnPMRecords(ids):
    """Parse one PubMed record per id and return the results as a list.

    Each id is passed to ``pp.parse_xml_web`` with ``save_xml=False``. The
    per-record calls run under a separate limiter of 3 calls per second.
    Results are returned in the order of ``ids``.
    """
    throttle = RateLimiter(max_calls=3, period=1)
    records = []

    for pmid in ids:
        with throttle:
            parsed = pp.parse_xml_web(pmid, save_xml=False)
            records.append(parsed)
    return records

In [None]:
# Ask for the term to explore; this blocks until something is typed.
newterm = input("Enter search term:")
# Ids of records with the term in the title (getPMIDs requests at most 1000).
ids = getPMIDs(newterm)
# Parsed records for those ids; the keyword-frequency and LDA cells below build on `tester`.
tester = returnPMRecords(ids)

In [None]:
#created a df
newTermDF = pd.DataFrame(tester)
#split keywords on ; with the vectorised string accessor, trimming whitespace
#and dropping empty pieces. A record with no keywords then yields an empty
#list, which explode() turns into NaN rather than an empty-string "keyword"
#that would be counted as a term. A non-string cell is treated the same way.
newTermDF['keywordssplit'] = (
    newTermDF['keywords']
    .str.split(';')
    .apply(
        lambda parts: [part.strip() for part in parts if part.strip()]
        if isinstance(parts, list)
        else []
    )
)
#each keyword gets its own row
newTermDF = newTermDF.explode('keywordssplit')

In [None]:
#created a df for frequency of terms
# Count once and reuse the result for both columns.
keyword_counts = newTermDF['keywordssplit'].value_counts()
frequency = keyword_counts.to_frame(name='Term Frequency')
frequency['Percentage'] = round((keyword_counts / keyword_counts.sum()) * 100, 2)
# Name the axis explicitly: the plotting cells read the keyword column as
# 'index', but recent pandas versions may name it after the source column.
frequency = frequency.rename_axis('index').reset_index()
# Preview the 15 most frequent keywords (head(15), not the 14 rows of [:14]).
frequency.head(15)

In [None]:
# The 15 most frequent keywords; the original [:14] slice kept only 14 rows.
top15 = frequency.head(15)
# Combined frequency of those keywords, used to scale the pie slices.
total = top15['Term Frequency'].sum()

In [None]:
# Each charted keyword's share of the charted total, in percent, as a plain list.
sizes = (top15['Term Frequency'] / total * 100).tolist()
# Matching slice labels: the keyword text held in the 'index' column.
names = top15['index'].tolist()

In [None]:
import matplotlib.pyplot as plt
from matplotlib import cm

# 40 evenly spaced colours sampled from the gist_rainbow colormap.
cs = cm.gist_rainbow(np.arange(40) / 40.)
labels = names

fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    labels=labels,
    colors=cs,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
# Equal axis scaling so the pie is drawn as a circle.
ax1.axis('equal')

# Enlarge the current figure (the one created above) before showing it.
fig = plt.gcf()
fig.set_size_inches(13, 13)
plt.show()

**Exploring NLTK for Frequency Visualizations**

In [None]:
#set the number of top keywords you want to look at
#https://stackoverflow.com/questions/40206249/count-of-most-popular-words-in-a-pandas-dataframe

import matplotlib
import matplotlib.pyplot as plt
top_N = 25

# Strip punctuation. regex=True is explicit because pandas >= 2.0 treats the
# pattern as a literal string by default, which silently removes nothing.
# The raw string avoids invalid-escape warnings for \w and \s.
newTermDF['keywordssplit'] = newTermDF['keywordssplit'].str.replace(r'[^\w\s]', '', regex=True)

# Lower-case everything, turn any '|' into a space, and join all keywords into one text.
txt = newTermDF.keywordssplit.str.lower().str.replace(r'\|', ' ', regex=True).str.cat(sep=' ')
words = nltk.tokenize.word_tokenize(txt)
word_dist = nltk.FreqDist(words)

stopwords = nltk.corpus.stopwords.words('english')
# Set lookup instead of scanning the stop-word list for every token.
stopword_set = set(stopwords)
words_except_stop_dist = nltk.FreqDist(w for w in words if w not in stopword_set)

print('All frequencies, without STOPWORDS:')
print('=' * 60)
rslt = pd.DataFrame(words_except_stop_dist.most_common(top_N),
                    columns=['Word', 'Frequency']).set_index('Word')
print(rslt)
print('=' * 60)

matplotlib.style.use('ggplot')

rslt.plot.bar()

### 3. Searching for Other Related Terms with Latent Dirichlet Allocation

In [None]:
import nltk
from nltk.corpus import stopwords
# Rebinds `stopwords` from the corpus reader to the plain list of English stop
# words; the data-cleaning cell below filters tokens against this list.
stopwords = stopwords.words('english')

In [None]:
#create df from title, abstract keywords. using data collected from 1,000 articles for pt. 2. example of "Default Mode Network"
ldadf = pd.DataFrame(tester)
# Take an explicit copy of the three text columns: the cleaning cell assigns
# into ldadf, and writing into a column subset of another frame triggers
# pandas' SettingWithCopyWarning.
ldadf = ldadf[['title', 'abstract', 'keywords']].copy()

In [None]:
# data cleaning

#space out keywords, remove ; (a literal character, so no regex needed)
ldadf['keywords'] = ldadf['keywords'].str.replace(";", " ", regex=False)

#place everything in lowercase and remove punctuation
ldadf = ldadf.apply(lambda x: x.astype(str).str.lower())
def remove_punctuation(x):
    """Return column ``x`` with every non-word, non-space character removed.

    ``x`` is one DataFrame column (a Series). A column without string values
    has no usable ``.str`` accessor and is returned unchanged.
    """
    try:
        # regex=True is explicit: pandas >= 2.0 treats the pattern literally
        # by default, which would leave the punctuation in place.
        return x.str.replace(r'[^\w\s]', '', regex=True)
    except AttributeError:
        return x

ldadf = ldadf.apply(remove_punctuation)

#remove numbers from keywords
ldadf['keywords'] = ldadf['keywords'].str.replace(r'\d+', '  ', regex=True)

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Split ``text`` on whitespace and return the list of lemmatized tokens."""
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

text_columns = ('title', 'abstract', 'keywords')

for column in text_columns:
    ldadf[column] = ldadf[column].apply(lemmatize_text)

#remove stopwords (set lookup instead of scanning the list for every token)
stopword_set = set(stopwords)
for column in text_columns:
    ldadf[column] = ldadf[column].map(lambda tokens: [t for t in tokens if t not in stopword_set])

In [None]:
# Show the cleaned frame: title, abstract and keywords as lists of tokens.
ldadf

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random state. Presumably this makes the LDA runs below
# repeatable, since none of them passes a random_state of its own
# (TODO confirm gensim falls back to NumPy's global state).
np.random.seed(2018)
import nltk

In [None]:
# Map every token that appears in the cleaned titles to an integer id.
dictionary = gensim.corpora.Dictionary(ldadf['title'])
# Preview the first 11 (id, token) pairs. items() is used because the
# Python-2-style iteritems() is not available on Dictionary in gensim 4.
for position, (token_id, token) in enumerate(dictionary.items()):
    if position > 10:
        break
    print(token_id, token)

In [None]:
# Print the dictionary's own short text summary.
print(dictionary)

In [None]:
# Bag-of-words form of each title: doc2bow converts a token list using the title dictionary.
bow_corpus = [dictionary.doc2bow(doc) for doc in ldadf['title']]

In [None]:
from gensim import corpora, models
from pprint import pprint

# Fit TF-IDF weights on the bag-of-words titles, then wrap the corpus with them.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Pretty-print only the first weighted document as a sanity check.
first_weighted_doc = next(iter(corpus_tfidf), None)
if first_weighted_doc is not None:
    pprint(first_weighted_doc)

In [None]:
# Train a 10-topic LDA model on the bag-of-words titles (2 passes, 2 workers).
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [None]:
# Print every topic of the bag-of-words model; -1 asks print_topics for all of them.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

In [None]:
# Train a second 10-topic model, this time on the TF-IDF weighted titles
# (10 passes, 4 workers), and print all of its topics for comparison.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=10, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

In [None]:
# Show the token list of one sample title. The label 510 is hard-coded, so
# this lookup fails when the frame has no row labelled 510.
ldadf['title'][510]

In [None]:
# Topic scores for document 510 of the bag-of-words corpus, highest score first.
ranked_topics = sorted(
    lda_model[bow_corpus[510]],
    key=lambda topic_score: topic_score[1],
    reverse=True,
)
for index, score in ranked_topics:
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

In [None]:
# Imported here because, in a top-to-bottom run, no earlier cell has imported
# pyLDAvis yet.
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
# Visualise the TF-IDF model against the bag-of-words form of the same titles
# and the same title dictionary. The name `corpus` is not defined until the
# "LDA Attempt 2" section and belongs to a different dictionary.
pyLDAvis.gensim.prepare(lda_model_tfidf, bow_corpus, dictionary)

**LDA Attempt 2**

In [None]:
import numpy as np
import logging
#import pyLDAvis.gensim
import json
import warnings
warnings.filterwarnings('ignore')  # Suppresses every warning from this point on, including deprecation notices, to keep the output short

from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from numpy import array

# Build a DataFrame from the parsed PubMed records held in `tester`.
p_df = pd.DataFrame(tester)
# Take each record's 'keywords' value as an array; docs_preprocessor below
# lower-cases and tokenizes every element.
docs = array(p_df['keywords'])

# Define function for tokenize and lemmatizing
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

def docs_preprocessor(docs):
    """Tokenize, filter and lemmatize a collection of text documents.

    Parameters
    ----------
    docs : iterable of str
        One string per document. The input is read only; it is no longer
        overwritten element by element.

    Returns
    -------
    list of list of str
        For each document, its lower-cased ``\\w+`` tokens, excluding tokens
        made only of digits and tokens of three characters or fewer, each
        passed through the WordNet lemmatizer.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()

    processed = []
    for doc in docs:
        tokens = tokenizer.tokenize(doc.lower())  # lower-case, then split into words
        processed.append([
            lemmatizer.lemmatize(token)
            for token in tokens
            # Drop pure numbers (words that merely contain digits stay) and
            # short tokens: only tokens longer than three characters are kept.
            if not token.isdigit() and len(token) > 3
        ])

    return processed

# Run the preprocessing on the keyword documents.
docs = docs_preprocessor(docs)
# Create bigram and trigram phrase models.
from gensim.models import Phrases
# min_count=10 is passed to the bigram model only; the trigram model is built
# from the bigram-transformed documents with Phrases' default settings.
bigram = Phrases(docs, min_count=10)
trigram = Phrases(bigram[docs])

# Detected phrases are joined with '_' and appended to the END of each
# document; the original tokens are kept as they are.
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a phrase found by the bigram model, add to document.
            docs[idx].append(token)
    # NOTE(review): the trigram model is applied to docs[idx] after the bigram
    # tokens were appended above, not to bigram[docs[idx]] - confirm this is intended.
    for token in trigram[docs[idx]]:
        if '_' in token:
            # Token is a phrase found by the trigram model, add to document.
            docs[idx].append(token)
            
# Remove rare and common tokens.
# Create a dictionary representation of the documents. This rebinds
# `dictionary`, which until now referred to the title dictionary.
dictionary = Dictionary(docs)
# no_below=2 and no_above=0.2 set the document-frequency bounds for keeping a token.
dictionary.filter_extremes(no_below=2, no_above=0.2)
# Bag-of-words corpus for topic modelling, built with the filtered dictionary.
corpus = [dictionary.doc2bow(doc) for doc in docs]
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))
# Show the first document's bag-of-words entry as a sanity check.
print(corpus[:1])

In [None]:
# Training settings for this run.
num_topics = 5
passes = 20
iterations = 400
chunksize = 500
eval_every = 1

# Look up one id so the dictionary "loads" its id -> token mapping, then use
# that mapping as the model's vocabulary.
temp = dictionary[0]
id2word = dictionary.id2token

lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    chunksize=chunksize,
    passes=passes,
    iterations=iterations,
    eval_every=eval_every,
    alpha='auto',
    eta='auto',
)

# Print the keywords of the 5 topics.
print(lda_model.print_topics())

In [None]:
import pyLDAvis.gensim

In [None]:
# Compute the coherence score of the current lda_model with the 'c_v' measure,
# using the tokenized documents and the keyword dictionary.
coherence_model_lda = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Compute the coherence score of the same model with the 'u_mass' measure.
# This overwrites coherence_model_lda and coherence_lda from the c_v cell.
coherence_model_lda = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence="u_mass")
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Train one LDA model per candidate topic count and score each with c_v.

    Candidate topic counts are ``range(start, limit, step)``, so ``limit``
    itself is excluded.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of tokenized input texts
    limit : Upper bound (exclusive) on the number of topics
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of trained LDA models, one per candidate topic count
    coherence_values : c_v coherence of each model, in the same order
    """
    model_list = []
    coherence_values = []

    for topic_count in range(start, limit, step):
        candidate = LdaModel(corpus=corpus, id2word=dictionary, num_topics=topic_count)
        scorer = CoherenceModel(model=candidate, texts=texts, dictionary=dictionary, coherence='c_v')
        model_list.append(candidate)
        coherence_values.append(scorer.get_coherence())

    return model_list, coherence_values

In [None]:
# Sweep settings, defined once and used for both the model sweep and the x
# axis so the two cannot drift apart.
limit=40; start=2; step=6;
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=docs, start=start, limit=limit, step=step)
# Show graph
import matplotlib.pyplot as plt
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The trailing comma makes this a one-element tuple. Without it the argument
# is a plain string and the legend takes its first character as the label.
plt.legend(("coherence_values",), loc='best')
plt.show()

In [None]:
# Same settings as the earlier training cell, but with 80 passes instead of 20.
num_topics = 5
passes = 80
iterations = 400
chunksize = 500
eval_every = 1

# Look up one id so the dictionary "loads" its id -> token mapping, then use
# that mapping as the model's vocabulary.
temp = dictionary[0]
id2word = dictionary.id2token

lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    chunksize=chunksize,
    passes=passes,
    iterations=iterations,
    eval_every=eval_every,
    alpha='auto',
    eta='auto',
)

# Print the keywords of the 5 topics.
print(lda_model.print_topics())

In [None]:
# Disabled alternative kept from an earlier experiment (a fixed-seed model):
#n_topics = 5
#n_top_words = 9
#my_lda = LdaModel(corpus, num_topics=n_topics, id2word=dictionary, random_state=120, minimum_probability=0)
# Render the interactive view of the retrained keyword model, using the
# keyword corpus and dictionary it was trained on.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)