# Health Related Federal Grant Tracking
----------------------------------------------------------------

The US government invests billions each year into the health research. The "open government" initiative provides more transparency into the governmental activities in the health related sector. Grant application data from NIH, ACF, AHRQ, CDC, HRSA, FDA, and VA are now easily accessible. With that large amount of data, we can now track the trend of the health research according to the amount of grant money. What research topics/areaes get more attractive? How have they been changing over the years? What area in the health sector will likely be the next breakthrough? I will try to explore those questions from the grant application database. 

First of all, the database called NIH ExPORTER is accesible [here](https://exporter.nih.gov/ExPORTER_Catalog.aspx). A total of 7.3 GB data were downloaded. 

In [2]:
%matplotlib inline
import matplotlib
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sns
import os
import dill
from tqdm import tqdm_notebook
import gensim
import pickle
import pyLDAvis.gensim
sns.set()
matplotlib.rcParams['figure.dpi'] = 144

In [22]:
from tqdm import tqdm_notebook
fns = os.listdir('./Data/PRJABS/')
fns.sort()
fns = ['./Data/PRJABS/'+x for x in fns]
proj_ABS = [pd.read_csv(fn,header = 0,dtype = {'APPLICATION_ID': str,'ABSTRACT_TEXT': str},engine = 'python') for fn in tqdm_notebook(fns)]


In [58]:
#add fiscal year number,fix the data type
for i in range(len(proj_ABS)):
    proj_ABS[i]['FY'] = i + 1985
    proj_ABS[i].apply(str)
#concate the whole table
proj_ABS = pd.concat(proj_ABS)

In [90]:
#all the non-missing entries
proj_ABS = proj_ABS[proj_ABS['ABSTRACT_TEXT']!= 'nan']

In [93]:
dill.dump(proj_ABS,open('abstracts.dill','wb'))

In [3]:
proj_ABS = dill.load(open('abstracts.dill','rb'))

# Unsupervised topic modeling with the abstracts
With all the non-missing text files in the dataset, I will start the NLP to extract some basic labels. I will try to set the number of topics ahead of time to evaluate what is the best number of topics. I will mainly use the Latent Dirichlet Allocation (LDA) topic modeling technique.

The abstracts will first be cleaned. Steps include parsing and tokenization, whitespace striping, lowering the words, word lemmatizing, and stop-words cleaning. The bags of word tokens will then be ready for the next step.  

In [3]:
text = proj_ABS['ABSTRACT_TEXT']
text_update = text[:1763542].append(text[1763549:]) #delete those entries that cause a lot of troubles

In [54]:
t_2017 = proj_ABS.query('FY == 2017')[['ABSTRACT_TEXT']]
t_2017[29384:29391] = 'nan' #need to update this and the followin line to pd.drop
t_2017 = t_2017[t_2017['ABSTRACT_TEXT'] != 'nan']
t_2017 = [prepare_text_for_lda(x) for x in tqdm_notebook(t_2017['ABSTRACT_TEXT'])]

In [69]:
#Text Cleaning
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

#word lemmatize to get the root word
import nltk
#nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

#clean stop words
#nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

#ensemble function
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [8]:
n = len(text)//10
n,len(text),2*n

(180658, 1806580, 361316)

In [13]:
n = len(text)//10
text_data1 = [prepare_text_for_lda(abs) for abs in tqdm_notebook(list(text)[:n])]




In [14]:
dill.dump(text_data1,open('abs_tokens1.dill','wb'))

In [5]:
text_data1 = dill.load(open('abs_tokens1.dill','rb'))

In [16]:
text_data2 = [prepare_text_for_lda(abs) for abs in tqdm_notebook(list(text)[n:2*n])]




In [17]:
dill.dump(text_data2,open('abs_tokens2.dill','wb'))

In [8]:
text_data2 = dill.load(open('abs_tokens2.dill','rb'))

In [10]:
text_data3 = [prepare_text_for_lda(abs) for abs in tqdm_notebook(list(text)[2*n:3*n])]




In [11]:
dill.dump(text_data3,open('abs_tokens3.dill','wb'))

In [9]:
text_data3 = dill.load(open('abs_tokens3.dill','rb'))

In [13]:
text_data4 = [prepare_text_for_lda(abs) for abs in tqdm_notebook(list(text)[3*n:4*n])]




In [14]:
dill.dump(text_data4,open('abs_tokens4.dill','wb'))

In [None]:
text_data4 = dill.load(open('abs_tokens4.dill','rb'))

In [9]:
text_data5 = [prepare_text_for_lda(abs) for abs in tqdm_notebook(list(text)[4*n:5*n])]




In [10]:
dill.dump(text_data5,open('abs_tokens5.dill','wb'))

In [11]:
text_data6 = [prepare_text_for_lda(abs) for abs in tqdm_notebook(list(text)[5*n:6*n])]




In [12]:
dill.dump(text_data6,open('abs_tokens6.dill','wb'))

In [9]:
text_data7 = [prepare_text_for_lda(abs) for abs in tqdm_notebook(list(text)[6*n:7*n])]




In [10]:
dill.dump(text_data7,open('abs_tokens7.dill','wb'))

In [9]:
text_data8 = [prepare_text_for_lda(abs) for abs in tqdm_notebook(list(text)[7*n:8*n])]




In [10]:
dill.dump(text_data8,open('abs_tokens8.dill','wb'))

In [9]:
text_data9 = [prepare_text_for_lda(abs) for abs in tqdm_notebook(list(text)[8*n:9*n])]




In [10]:
dill.dump(text_data9,open('abs_tokens9.dill','wb'))

In [17]:
text_data10 = [prepare_text_for_lda(abs) for abs in tqdm_notebook(list(text)[9*n:9*n+137620])]

In [23]:
9*n+137620,9*n+137627

(1763542, 1763549)

In [77]:
#list(text)[9*n+137620:9*n+137627]

In [18]:
dill.dump(text_data10,open('abs_tokens10.dill','wb'))

In [21]:
text_data11 = [prepare_text_for_lda(abs) for abs in tqdm_notebook(list(text)[9*n+137627:9*n+137665])]

In [24]:
dill.dump(text_data11,open('abs_tokens11.dill','wb'))

In [11]:
text_data12 = [prepare_text_for_lda(abs) for abs in tqdm_notebook(list(text)[9*n+137665:])]

In [12]:
dill.dump(text_data12,open('abs_tokens12.dill','wb'))

In [78]:
#len(text)

In [5]:
text_update = text[:1763542].append(text[1763549:]) #delete those entries that cause a lot of troubles

NameError: name 'text' is not defined

In [27]:
len(text_update)

1806573

In [None]:
text_data = [prepare_text_for_lda(abs) for abs in tqdm_notebook(list(text_update))]

In [None]:
dill.dump(text_data,open('abs_tokens.dill','wb'))

Now that, the bags of words are ready. We can start getting the topics out of them. 

In [1]:
len(text_data1)
#text_data1[:2]

NameError: name 'text_data1' is not defined

In [10]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data1)
corpus = [dictionary.doc2bow(text) for text in tqdm_notebook(text_data1)]




In [30]:
pickle.dump(corpus, open('corpus1.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [11]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.026*"cell" + 0.014*"study" + 0.010*"tumor" + 0.010*"human" + 0.010*"antibody" + 0.009*"antigen" + 0.008*"response" + 0.008*"disease" + 0.008*"infection" + 0.008*"determine"')
(1, '0.028*"protein" + 0.012*"study" + 0.011*"sequence" + 0.011*"gene" + 0.009*"expression" + 0.009*"specific" + 0.008*"molecular" + 0.008*"receptor" + 0.008*"structure" + 0.008*"binding"')
(2, '0.016*"study" + 0.013*"research" + 0.011*"patient" + 0.008*"clinical" + 0.008*"program" + 0.007*"project" + 0.007*"cancer" + 0.007*"treatment" + 0.007*"group" + 0.006*"provide"')
(3, '0.015*"study" + 0.010*"brain" + 0.009*"receptor" + 0.009*"system" + 0.009*"neuron" + 0.008*"effects" + 0.007*"determine" + 0.006*"response" + 0.006*"mechanism" + 0.006*"change"')
(4, '0.016*"study" + 0.008*"determine" + 0.007*"mechanism" + 0.006*"effects" + 0.006*"metabolism" + 0.005*"change" + 0.005*"increase" + 0.005*"tissue" + 0.005*"blood" + 0.004*"muscle"')


In [18]:
ldamodel.get_document_topics(corpus[10])

[(2, 0.99246734)]

In [4]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus1.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=True)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [74]:
from gensim import corpora
from gensim.models import TfidfModel
words_rm = ['program','research','training','health','center','support','investigator',
            'community','resource','trainee','science','student','administrative',
            'abstract','project','career','faculty','services','university','contract',
            'mentor','provide','meeting','school','director','collaboration','facility',
            'institute','department','researcher','biology','member']
t_2017 = proj_ABS.query('FY == 2017')[['ABSTRACT_TEXT']]
#t_2017.drop(t_2017.index([range(29384,29391)]),inplace=True)
t_2017[29384:29391] = 'nan' #need to update this and the followin line to pd.drop
t_2017 = t_2017[t_2017['ABSTRACT_TEXT'] != 'nan']
t_2017 = [prepare_text_for_lda(x) for x in tqdm_notebook(t_2017['ABSTRACT_TEXT'])]
#t_2017 = [[word for word in abs if word not in words_rm] for abs in t_2017]
dict_2017 = corpora.Dictionary(t_2017)
corpus_2017 = [dict_2017.doc2bow(text) for text in tqdm_notebook(t_2017)]
model = TfidfModel(corpus_2017)
vector_2017 = model[corpus_2017]

In [75]:
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(vector_2017, num_topics = NUM_TOPICS, id2word=dict_2017, passes=5)
#ldamodel.save('model10.gensim')

lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus_2017, dict_2017, sort_topics=True)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [76]:
pickle.dump(corpus_2017, open('corpus_2017.pkl', 'wb'))
dict_2017.save('dict_2017.gensim')
ldamodel.save('model5_2017.gensim')