# Executive summary

* This notebook contains the experiments from the publication "a novel NIH grant recommender using BERT".
* Major experiments conducted:
 * create dataset with false pairs 
 * train, validation and test 
 * for manual evaluations, refer to manualEval & corresponding folders

In [None]:
#general 
import os
import argparse
import pickle
import dill
import logging

#you cannot live without 
from tqdm import trange
import pandas as pd
import numpy as np
import time
#import matplotlib.pyplot as plt
import random
from termcolor import colored
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import average_precision_score, roc_auc_score
from scipy.special import softmax

#pip install transformers
#pytorch related
import torch
import torch.nn as nn
import torch.nn.functional as F

#bert related
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

#self-defined
from dataProcessing_bert_newSplit import RFADataProcess as DP
import utils_bert as ut
from utils_bert import flat_accuracy,flat_accuracy, format_time, set_seed, train_batch, evaluate_batch, save_model, plot_train
from rfa_bert import GrantModel
from eval_metrics import Metrics

# Enable autograd anomaly detection for every backward pass run by this notebook.
# NOTE(review): PyTorch documents this mode as a debugging aid that slows
# execution -- consider disabling it for full training runs.
torch.autograd.set_detect_anomaly(True)

In [None]:
#for calling the file from terminal 
#in the order that they will be used
def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` must not be used with argparse: it calls ``bool(token)`` on
    the raw string, and every non-empty string (including ``'False'``) is
    truthy, so a flag could never be switched off from the command line.

    Args:
        value: the raw token (or an actual ``bool``).

    Returns:
        ``True`` or ``False`` according to the textual spelling.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string is kept as False: with the former ``type=bool`` it was
    # the only token that produced False, so existing invocations still work.
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value (true/false), got {!r}'.format(value))


parser = argparse.ArgumentParser(description = 'BERT model for Grant recommendation')
parser.add_argument('-data_path', type = str, default = 'newdata/', 
                    help = 'complete path to the training data [default:newdata/]')
parser.add_argument('-load_pretrained', type = _str2bool, default = False,
                    help = 'whether to load pretrained bert embeddings & tokenizer [default:False]')
parser.add_argument('-load_path', type = str, default = 'evalAuto/bert/', 
                    help = """path where fine-tuned bert embeddings & tokenizer  
                           are saved [default:evalAuto/bert/]""")
parser.add_argument('-newSplit', type = _str2bool, default = True, 
                    help = """whether split the data for MRR calculation [default: True]""")
parser.add_argument('-cuda_device', type = int, default = 1, 
                    help = 'if has cuda, device number to be used [default:1]')
parser.add_argument('-learning_rate', type = float, default = 2e-5, 
                    help = 'learning rate of Bert model optimizer [default:2e-5]')
parser.add_argument('-epsilon', type = float, default = 1e-8, 
                    help = 'epsilon of Bert model optimizer [default:1e-8]')
parser.add_argument('-train_epochs', type = int, default = 4, 
                    help = 'fine tune epoch numbers [default: 4]')
parser.add_argument('-plot_train', type = _str2bool, default = True, 
                    help = 'Whether to plot training stats [default: True]')
# An empty argv is parsed on purpose so that only the defaults above are used
# (the notebook kernel's own command-line arguments are never read).
args = parser.parse_args([])

In [None]:
def main(args):
    """Fine-tune the BERT grant model and print test-set evaluation metrics.

    Steps, in order: seed the RNGs through ``set_seed``; build the
    train/valid/test loaders with ``DP``; move ``GrantModel(...).model`` to the
    selected device; fine-tune through ``ut.train`` using AdamW and a linear
    schedule without warmup; optionally plot the training stats; predict on the
    test loader; finally print AUC/AP, MRR, recall@k, precision@k and MAP.

    Args:
        args: namespace providing ``data_path``, ``load_pretrained``,
            ``load_path``, ``newSplit``, ``cuda_device``, ``learning_rate``,
            ``epsilon``, ``train_epochs`` and ``plot_train`` (it is also
            forwarded unchanged to ``ut.train``).

    Returns:
        None. All results are printed. A ``KeyboardInterrupt`` raised anywhere
        in the pipeline is caught and reported instead of propagating.
    """
    seed_val = 1234
    set_seed(seed_val) # we'll need to deal with this for ensemble methods: also pytorch initialization are random 
    try:
        #print('data preparing...')
        dp =  DP(path= args.data_path,
                  load_pretrained = args.load_pretrained, 
                  load_path = args.load_path,
                  newSplit = args.newSplit)
        dp.dataframize_()
        #then to dataloader 
        train_loader, valid_loader, test_loader, _ = dp.dataloaderize_() #dataloader right here  

        # Device selection. The requested GPU index is validated because the
        # default (1) does not exist on single-GPU hosts and would otherwise
        # only fail later, inside model.to().
        if torch.cuda.is_available():
            device_index = args.cuda_device
            if not 0 <= device_index < torch.cuda.device_count():
                print('cuda:{} is not available, falling back to cuda:0'.format(device_index))
                device_index = 0
            use_cuda = torch.device('cuda:' + str(device_index))
        else:
            use_cuda = torch.device('cpu')
        #model loading
        model = GrantModel(load_pretrained = args.load_pretrained, load_path = args.load_path).model
        model.to(use_cuda)
        
        #optimizer and scheduler
        optimizer = AdamW(model.parameters(),
                          lr = args.learning_rate,
                          eps = args.epsilon)

        # Create the learning rate scheduler.
        # One scheduler step per training batch over all epochs.
        total_steps = len(train_loader) * args.train_epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                    num_warmup_steps = 0, # Default value in run_glue.py
                                                    num_training_steps = total_steps)
        #train and valid and save
        training_stats, model = ut.train(epochs = args.train_epochs, 
                                         model = model,
                                         train_loader = train_loader, 
                                         valid_loader = valid_loader, 
                                         optimizer = optimizer, 
                                         scheduler = scheduler, 
                                         use_cuda = use_cuda,
                                         tokenizer = dp.tokenizer,
                                         args = args)

        if args.plot_train:
            ut.plot_train(training_stats, args.load_path)

        #prediction on test
        combine_predictions, combine_true_labels = ut.predictions(model = model, 
                                                                  test_loader = test_loader, 
                                                                  use_cuda = use_cuda, 
                                                                  path = args.load_path)

        # os.path.join keeps the path valid whether or not data_path ends
        # with a separator (plain concatenation required the trailing '/').
        citation_df = pd.read_csv(os.path.join(args.data_path, 'citation_data_test.csv'))
        # The second returned value is not used in this function.
        similarity_dict, _ = ut.create_smilarity_dict(citation_df = citation_df, 
                                                      combine_predictions = combine_predictions, 
                                                      save_path = args.load_path)
        # assumes combine_predictions is a 2-D array of per-class scores with
        # column 1 being the positive class -- TODO confirm against ut.predictions
        p = softmax(combine_predictions, axis =1)
        prob1 = p[:,1]
        ap = average_precision_score(combine_true_labels,prob1)
        auc = roc_auc_score(combine_true_labels, prob1)
        print('AUC = {} and AP = {}'.format(auc, ap))

        # NOTE(review): Metrics is rebuilt for every call, as before; if it is
        # stateless a single shared instance would do -- confirm in eval_metrics.
        print('MRR:')
        print(Metrics(dp.citation).calculate_mrr(similarity_dict)) #mrr

        print('recall@1, recall@5:')
        print(Metrics(dp.citation).calculate_recall_at_k(similarity_dict, 1))
        print(Metrics(dp.citation).calculate_recall_at_k(similarity_dict, 5))

        print('precision@1, precision@5:')
        print(Metrics(dp.citation).calculate_precision_at_k(similarity_dict, 1))        
        print(Metrics(dp.citation).calculate_precision_at_k(similarity_dict, 5))

        print('MAP:')
        print(Metrics(dp.citation).calculate_MAP_at_k(similarity_dict))
        
    except KeyboardInterrupt:
        print(colored('--' * 70, 'green'))
        print(colored('Exiting from training early', 'green'))
        

In [None]:
# Entry point: run the whole pipeline with the `args` namespace built by the
# argparse cell of this notebook.
if __name__ == "__main__":
    main(args)