# Executive summary

* This notebook contains the experiments from the publication "A novel NIH research grant recommender using BERT".
* Major experiments conducted:
 * creating all test data, using the class from dataProcessing_bert_service, with data processing borrowed from CVProcessing
 * predicting with an already trained model (load the trained model, get the results, and write the results using utilities from utils)

In [1]:
#general 
import os
import argparse
import pickle
import dill
import logging

#you cannot live without 
from tqdm import trange
import pandas as pd
import numpy as np
import time
#import matplotlib.pyplot as plt
import random
from termcolor import colored
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import average_precision_score, roc_auc_score
from scipy.special import softmax

#pip install transformers
#pytorch related
import torch
import torch.nn as nn
import torch.nn.functional as F

#bert related
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

#self-defined
from dataProcessing_bert_service import RFADataProcessForPred as DP
import utils_bert_service as ut
from utils_bert_service import flat_accuracy,flat_accuracy, format_time, set_seed, train_batch, evaluate_batch, save_model, plot_train
import sys
sys.path.insert(0,'..')
from rfa_bert import GrantModel

# Turn on autograd anomaly detection for the whole session. This is a
# debugging aid that adds runtime overhead.
# NOTE(review): this notebook only runs prediction with a trained model, so
# this can probably be disabled — confirm before removing.
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7f0eec285ed0>

In [2]:
# Command-line style configuration for the grant recommender service.
# The arguments are declared in the order in which main() uses them.
def _str2bool(value):
    """Convert a command-line token into a real boolean.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including "False") becomes True. This parser accepts the
    usual textual spellings and rejects anything else.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % value)


parser = argparse.ArgumentParser(description = 'BERT model for Grant recommendation, service mode')
parser.add_argument('-data_path', type = str, default = '../newdata/', 
                    help = 'complete path to the rfa data  [default:../newdata/]')
parser.add_argument('-cv_path', type = str, default = '../../part2_CollaRec/service/', 
                    help = 'complete path to users CV  [default:../../part2_CollaRec/service/, \
                    used initially in collaborator recommendation]')
# Parsed with _str2bool so that "-load_pretrained False" really disables it.
parser.add_argument('-load_pretrained', type = _str2bool, default = True,
                    help = 'whether to load pretrained bert embeddings & tokenizer [default:True]')
parser.add_argument('-load_path', type = str, default = '../model_uq/', 
                    help = """path where fine-tuned bert embeddings & tokenizer  
                           are saved [default:../model_uq/, normally should be evalAuto/bert/ 
                           but we are borrowing from trained from other projct: uncertainty quantification]""")
parser.add_argument('-cuda_device', type = int, default = 1, 
                    help = 'if has cuda, device number to be used [default:1]')
# researcher whose CV is used to generate the recommendations
parser.add_argument('-f_name', type = str, default = 'Bijal', 
                    help = "first name of the reseacher, sentence captilization")
parser.add_argument('-m_name', type = str, default = 'A', 
                    help = "middle name of the reseacher, sentence captilization")
parser.add_argument('-l_name', type = str, default = 'Balasubramanian', 
                    help = "last name of the reseacher, sentence captilization")
parser.add_argument('-top', type = int, default = 20, 
                    help = 'number of recommendations (per cluster) [default:20]')
# An empty argv is passed on purpose: inside a notebook sys.argv belongs to
# the kernel launcher, so only the defaults above are used.
args = parser.parse_args([])

In [3]:
_LOG_FILE = "logfile"
_LOG_FORMAT = "%(asctime)-15s %(levelname)-8s %(message)s"


def _get_service_logger():
    """Return the service logger, writing each record to the log file once.

    The root logger is still configured through basicConfig so that other
    loggers keep the same file/format. The named logger gets its own file
    handler only on the first call (re-running the cell must not stack
    handlers) and does not propagate to the root logger, otherwise every
    record would be written to the same file twice.
    """
    logging.basicConfig(level=logging.ERROR, filename=_LOG_FILE, filemode="a+",
                        format=_LOG_FORMAT)
    logger = logging.getLogger('Grant recommender service')
    if not logger.handlers:
        handler = logging.FileHandler(_LOG_FILE)
        handler.setFormatter(logging.Formatter(_LOG_FORMAT))
        logger.addHandler(handler)
    logger.propagate = False
    return logger


def _select_device(cuda_device, logger):
    """Pick the torch device for prediction.

    Uses the CPU when CUDA is unavailable. When the requested CUDA index does
    not exist on this machine, falls back to ``cuda:0`` instead of failing
    later when the model is moved to the device.
    """
    if not torch.cuda.is_available():
        return torch.device('cpu')
    if 0 <= cuda_device < torch.cuda.device_count():
        return torch.device('cuda:' + str(cuda_device))
    logger.error('cuda device %s is not available, falling back to cuda:0', cuda_device)
    return torch.device('cuda:0')


def main(args):
    """Run the grant recommendation service for one researcher.

    Steps: seed the RNGs, build the prediction data from the RFA data and the
    researcher's CV, load the fine-tuned model, run the predictions and build
    the clustered recommendations.

    Args:
        args: parsed arguments; reads data_path, cv_path, load_pretrained,
            load_path, cuda_device, f_name, m_name, l_name and top.

    Returns:
        The value returned by ``ut.clustered_recom`` (presumably the clustered
        recommendations — confirm against utils_bert_service), or None when
        the run is interrupted from the keyboard.
    """
    seed_val = 1234
    set_seed(seed_val)

    # get logger started
    logger = _get_service_logger()
    logger.error('Grant recommender service')

    try:
        dp = DP(path1 = args.data_path, path2 = args.cv_path,
                load_pretrained = args.load_pretrained, load_path = args.load_path,
                f_name = args.f_name, m_name = args.m_name, l_name = args.l_name,
                logger = logger)
        dp.dataframize_()
        # only the first returned loader is used for prediction
        test_loader, _ = dp.dataloaderize_()

        use_cuda = _select_device(args.cuda_device, logger)

        model = GrantModel(load_pretrained = args.load_pretrained, load_path = args.load_path).model
        model.to(use_cuda)

        # predictions; the returned values are not used further in this function
        pred_flat, probas = ut.getPredRes(model = model,
                                          test_loader = test_loader,
                                          use_cuda = use_cuda,
                                          f_name = args.f_name, l_name = args.l_name)

        clustered = ut.clustered_recom(f_name = args.f_name, m_name = args.m_name, l_name = args.l_name,
                                       data_path = args.data_path, logger = logger,
                                       top = args.top)
        return clustered

    except KeyboardInterrupt:
        # this service only predicts, so the message no longer mentions training
        print(colored('--' * 70, 'green'))
        print(colored('Exiting from prediction early', 'green'))
        return None
        

In [4]:
# Entry point: run the service with the arguments parsed from the defaults
# (`args = parser.parse_args([])`) when this notebook/script is executed directly.
if __name__ == "__main__":
    main(args)

pairs dataframe loaded
length of the corpus 235392
sample of the corpus ['survey strategies increase participant response rates primary care research studies ', 'survey strategies increase participant response rates primary care research studies ']
length of the corpus 235392
sample of the corpus ['diabetes research training behavioral scientist t32 foster development diverse highly trained workforce behavioral scientist assume leadership role relate nation research effort area type diabetes national institute diabetes digestive kidney disease niddk national institute nursing research ninr invite application establishment institutional research training program develop cadre diverse highly trained behavioral scientist conduct research relevant improve clinical management quality life patient type diabetes.training grant t32 award make eligible institution provide program prepare predoctoral postdoctoral behavioral scientist select institution behavioral research career type diabetes st

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


...DONE.
