# Build and Clean Dataset


This notebook cleans up and consolidates the NIH grant datasets. 

Inputs:
 - ../data/raw/ should contain the CSV datasets downloaded from NIH RePORTER (ExPORTER)

Outputs:
 - ../data/clean/NIH_grants.csv will contain the cleaned and consolidated dataset
 
 
Steps:
- For each abstract, convert all words to lowercase and remove non-alphanumeric characters
- Lemmatize all words using the spaCy en_core_web_sm model
- Create bigram model and learn word pairs
- Output single dataframe with all grant information and cleaned, lemmatized, bigrammed grant abstracts
 
Bigram models will be saved to ../models/. 

In [1]:
import sys

# Add the parent directory to the import search path; the `grantminer` imports
# in the following cell presumably rely on this (verify against the repo layout).
sys.path.append('../')

In [2]:
from time import time
from grantminer.config import filename, filepath
import grantminer.data as data
import pandas as pd

import logging

# Surface INFO-level records (timestamped) so gensim's progress can be followed.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format="%(levelname)s - %(asctime)s: %(message)s",
)



Spacy en_core_web_sm loaded!


In [3]:
# Clean each fiscal year's abstracts on its own, save the per-year result,
# then stack all years into a single frame.
years = [2018, 2019, 2020]
cleaned_by_year = []

for y in years:

    print('Cleaning up year: ' + str(y))

    started = time()

    # Load the raw abstracts, run the text cleaner on ABSTRACT_TEXT, then
    # discard the raw text column and tag every row with its year.
    raw_abstracts = data.load(y, abstract=True, verbose=True)
    with_clean_text = data.clean_text_from_df(raw_abstracts, 'ABSTRACT_TEXT', verbose=True)
    year_frame = with_clean_text.drop(columns=['ABSTRACT_TEXT']).assign(Year=y)

    year_frame.to_csv(filename(y, abstract=True, fpath='processed'))

    elapsed_minutes = round((time() - started) / 60, 2)
    print('Time to clean up: {} mins'.format(elapsed_minutes))

    cleaned_by_year.append(year_frame)


nih_abstracts = pd.concat(cleaned_by_year).reset_index(drop=True)

Cleaning up year: 2018
Reading filename: ../data/raw/RePORTER_PRJABS_C_FY2018_new.csv
Number of lines: 80395
Lowercased and removed non-alphanumeric characters.
Cleaned and lemmatized.
Time to clean up: 38.08 mins
Cleaning up year: 2019
Reading filename: ../data/raw/RePORTER_PRJABS_C_FY2019_new.csv
Number of lines: 79107
Lowercased and removed non-alphanumeric characters.
Cleaned and lemmatized.
Time to clean up: 35.13 mins
Cleaning up year: 2020
Reading filename: ../data/raw/RePORTER_PRJABS_C_FY2020_new.csv
Number of lines: 78028
Lowercased and removed non-alphanumeric characters.
Cleaned and lemmatized.
Time to clean up: 32.42 mins


In [4]:
# Load the application-level fields for every year and join them onto the
# cleaned abstracts. The right join keeps every abstract row, even when no
# application record matches its APPLICATION_ID.
yearly_fields = [data.load(y) for y in years]
application_fields = pd.concat(yearly_fields).reset_index(drop=True)

cleaned = pd.merge(application_fields, nih_abstracts, on='APPLICATION_ID', how='right')
cleaned.to_csv(filepath['processed'] / 'NIH_grants_cleaned.csv')

b'Skipping line 55941: expected 46 fields, saw 47\n'
b'Skipping line 3706: expected 46 fields, saw 47\n'
b'Skipping line 63777: expected 46 fields, saw 47\n'


In [20]:
# Flag rows whose cleaned abstract is not a plain string, then count them
blanks = cleaned['clean'].apply(lambda value: type(value) is not str)
blanks.sum()

362

In [21]:
# Drop the rows flagged in `blanks`. The labels come from the mask's own index
# and labels that are already gone are ignored, so re-running this cell is a
# no-op rather than an IndexError from a mask/index length mismatch.
cleaned = cleaned.drop(index=blanks[blanks].index, errors='ignore')

In [22]:
# Overwrite the intermediate CSV now that the blank abstracts have been removed
cleaned_csv = filepath['processed'] / 'NIH_grants_cleaned.csv'
cleaned.to_csv(cleaned_csv)

In [23]:
# Fit a phrase (bigram) model on the 'clean' column and persist it to disk
bigram_model = data.find_bigrams(cleaned, 'clean')
bigram_path = str(filepath['bigram'])
bigram_model.save(bigram_path)

INFO - 15:15:05: collecting all words and their counts
INFO - 15:15:05: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 15:15:08: PROGRESS: at sentence #10000, processed 2379668 words and 1092174 word types
INFO - 15:15:11: PROGRESS: at sentence #20000, processed 4718402 words and 1829551 word types
INFO - 15:15:14: PROGRESS: at sentence #30000, processed 7115000 words and 2470831 word types
INFO - 15:15:17: PROGRESS: at sentence #40000, processed 9498333 words and 3039229 word types
INFO - 15:15:19: PROGRESS: at sentence #50000, processed 11833021 words and 3537151 word types
INFO - 15:15:22: PROGRESS: at sentence #60000, processed 14159768 words and 3986345 word types
INFO - 15:15:25: PROGRESS: at sentence #70000, processed 16493681 words and 4394584 word types
INFO - 15:15:28: PROGRESS: at sentence #80000, processed 18825366 words and 4759227 word types
INFO - 15:15:31: PROGRESS: at sentence #90000, processed 21242987 words and 4847630 word types
INFO - 15:15:34:

In [24]:
# Export the trained model into its smaller frozen form and save that too.
# NOTE(review): this rebinds `bigram_model`, so re-running the cell on its own
# would pass the already-frozen model back into freeze_bigram - presumably unintended.
bigram_model = data.freeze_bigram(bigram_model)
frozen_path = str(filepath['bigram_frozen'])
bigram_model.save(frozen_path)

INFO - 15:16:18: exporting phrases from Phrases<6700854 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 15:16:31: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<19309 phrases, min_count=30, threshold=10.0> from Phrases<6700854 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 13.10s', 'datetime': '2021-05-04T15:16:31.644659', 'gensim': '4.0.1', 'python': '3.8.8 | packaged by conda-forge | (default, Feb 20 2021, 16:12:38) \n[Clang 11.0.1 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}
INFO - 15:16:31: FrozenPhrases lifecycle event {'fname_or_handle': '../models/bigram_frozen.pkl', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-05-04T15:16:31.974993', 'gensim': '4.0.1', 'python': '3.8.8 | packaged by conda-forge | (default, Feb 20 2021, 16:12:38) \n[Clang 11.0.1 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'saving'}
INFO - 15:16:31: saved ../models/bigram_frozen.pkl

In [25]:
def _join_bigrams(text):
    """Split on whitespace, pass the tokens through the bigram model, re-join with spaces."""
    return ' '.join(bigram_model[text.split()])


# Rewrite every cleaned abstract with the bigram model applied
cleaned['clean'] = cleaned['clean'].apply(_join_bigrams)

In [29]:
# Write the final dataset after renumbering the rows from zero
final_csv = filepath['clean'] / 'NIH_grants.csv'
cleaned.reset_index(drop=True).to_csv(final_csv)