## Functionality Summary
In this notebook, we:
* ran baseline experiments with naive Bayes (NB), using vectors created from TF-IDF
* built the whole vocabulary on the RFA and tested on the publications belonging to the test data
* chose the vector representations based on the best performance on train/validation
* ran all evaluations on the test datasets

In [5]:
import pickle
import os
import sys
import argparse 
import ast
import pandas as pd
import numpy as np
import logging 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
#from sklearn import cross_validation
# from sklearn.model_selection import RandomizedSearchCV
from scipy import sparse
from scipy.sparse import hstack
from sklearn.metrics import accuracy_score, roc_curve, auc

#local 
import utils_bsl as ut

In [6]:
# Command-line style configuration, declared in the order the options are used.
def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because any non-empty string is truthy. This converter accepts the usual
    spellings instead and keeps the empty string mapped to ``False`` (which is
    what ``bool("")`` returned before).

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got {!r}'.format(value))


parser = argparse.ArgumentParser(description = 'NB classifier (with tfidf) for Grant recommendation')
parser.add_argument('-data_path', type = str, default = 'newdata/',
                    help = 'complete path to the training data [default:newdata/]')
parser.add_argument('-load_pretrained', type = _str2bool, default = False,
                    help = 'whether to load pretrained NB related vectors/models [default:False]')
parser.add_argument('-load_path', type = str, default = 'evalAuto/nb/',
                    help = 'path where nb related vectors/models are saved [default:evalAuto/nb/]')
# some training parameters regarding NB
# ngram_range is kept as a string holding a tuple literal; main() parses it with ast.literal_eval
parser.add_argument('-ngram_range', type = str, default = '(1,2)', help = 'see sklearn TFIDF params')
parser.add_argument('-min_df', type = int, default = 2, help = 'see sklearn TFIDF params')
parser.add_argument('-max_features', type = int, default = 2000, help = 'see sklearn TFIDF params')
# alpha is a smoothing value such as 0.5, so it has to be parsed as a float (int('0.5') fails)
parser.add_argument('-alpha', type = float, default = 0.5, help = 'see sklearn NB params')
parser.add_argument('-top', type = int, default = 10,
                    help = 'number of recommendations to take [default:10]')
# An explicit empty list makes argparse ignore the notebook kernel's own sys.argv,
# so every option takes its default value here.
args = parser.parse_args([])

In [7]:
def main(args):
    """Train the TF-IDF + multinomial naive Bayes baseline and log its metrics.

    The function loads the data through ``ut.load_data``, selects the train,
    validation and test inputs with ``ut.select_data_nb``, fits the pipeline on
    the training split, predicts class probabilities for all three splits,
    builds one similarity dictionary per split with ``ut.create_smilarity_dict``
    and finally logs AUC and the metrics printed by ``ut.print_metrics`` for the
    train and test splits.

    Args:
        args: parsed argument namespace. The attributes read here are
            ``load_path`` (string prefix of the log file, also handed to the
            ``ut`` helpers as ``outpath`` / ``save_path``), ``data_path``,
            ``load_pretrained``, ``ngram_range`` (string holding a tuple
            literal), ``min_df``, ``max_features`` and ``alpha``.

    Side effects:
        Appends log records to ``args.load_path + "logfile"`` and prints a
        separator line to stdout. A ``KeyboardInterrupt`` is caught and reported
        instead of being propagated.
    """
    seed_val = 1234
    ut.set_seed(seed_val)

    # get logger started
    log_file = args.load_path + "logfile"
    log_format = "%(asctime)-15s %(levelname)-8s %(message)s"
    # Root configuration is kept for records emitted outside this function's logger.
    # basicConfig does nothing when the root logger already has handlers.
    logging.basicConfig(level=logging.ERROR, filename=log_file, filemode="a+",
                        format=log_format)
    logger = logging.getLogger('NB for grant')
    # The logger owns exactly one formatted handler and does not propagate to the
    # root handler, otherwise every record would be written to the file twice.
    handler = logging.FileHandler(log_file)
    handler.setFormatter(logging.Formatter(log_format))
    logger.addHandler(handler)
    logger.propagate = False

    try:
        # Messages are emitted with logger.error because logging is configured
        # at ERROR level above.
        logger.error('NB for grant')

        nb = Pipeline(steps=[
            ('vectorizer', TfidfVectorizer(ngram_range=ast.literal_eval(args.ngram_range),
                                           min_df=args.min_df,
                                           max_features=args.max_features)),
            ('mnb', MultinomialNB(alpha=args.alpha)),
        ])

        # train, valid and test
        (rfas, pubs, mix_df,
         train_idx, valid_idx, test_idx,
         train_citation, valid_citation, citation,
         train_mixed, valid_mixed, test_mixed) = ut.load_data(args.data_path)

        # Each split is processed in the fixed order train -> valid -> test.
        split_indices = (('train_', train_idx), ('valid_', valid_idx), ('test_', test_idx))

        # selected[name] is the (X, y, pmids, rfaids) tuple returned by the helper
        selected = {}
        for idx_name, idx in split_indices:
            selected[idx_name] = ut.select_data_nb(idx=idx, mix_df=mix_df,
                                                   pubs=pubs, rfas=rfas,
                                                   outpath=args.load_path, idx_name=idx_name,
                                                   load_pretrained=args.load_pretrained)

        # train and predict
        train_X, train_y = selected['train_'][0], selected['train_'][1]
        test_y = selected['test_'][1]
        nb.fit(train_X, train_y)
        probas = {}
        for idx_name, _ in split_indices:
            probas[idx_name] = nb.predict_proba(selected[idx_name][0])

        # The validation dictionary is not evaluated below, but the helper is still
        # called for it because it receives a save_path like the other splits.
        similarity = {}
        for idx_name, _ in split_indices:
            _, _, pmids, rfaids = selected[idx_name]
            similarity[idx_name], _ = ut.create_smilarity_dict(
                pmids=pmids, rfaids=rfaids,
                combine_predictions_probas=probas[idx_name],
                save_path=args.load_path, idx_name=idx_name)

        # evaluation on train and test
        logger.error('=======train statistics======')
        train_auc = ut.mini_auc(train_y, probas['train_'])
        logger.error('train auc = {}'.format(train_auc))
        ut.print_metrics(citation=train_citation, similarity_dict=similarity['train_'],
                         logger=logger, ks=[1, 5])
        print('=========================================')
        logger.error('=======test statistics======')
        test_auc = ut.mini_auc(test_y, probas['test_'])
        logger.error('test auc = {}'.format(test_auc))
        ut.print_metrics(citation=citation, similarity_dict=similarity['test_'],
                         logger=logger, ks=[1, 5])

    except KeyboardInterrupt:
        # Plain prints: `colored` was never imported in this notebook, so calling
        # it here raised NameError instead of reporting the interruption.
        print('--' * 70)
        print('Exiting from training early')
    finally:
        # Detach and close the handler on every exit path so that re-running this
        # function in the same kernel does not accumulate handlers.
        logger.removeHandler(handler)
        handler.close()
        logging.shutdown()

In [8]:
# Run the experiment with the module-level `args` namespace when this code is
# executed as the main module (which is the case inside a notebook kernel).
if __name__ == "__main__":
    main(args)

length of the corpus 310092
sample of the corpus ['clinical response dorsal duct drainage via minor papilla refractory obstruct chronic calcific pancreatitis background study aim complete stone removal main pancreatic duct might achieve patient obstructive chronic calcific pancreatitis report result endoscopic dorsal pancreatic duct dpd bypass obstruct stone ventral pancreatic duct vpd patient method patient obstructive chronic calcific pancreatitis treat dpd bypass clinical success define significant pain relief hospital admission pain management ongoing treatment period result among patient meet entry criterion 62.5 history unsuccessful endoscopic therapy fail extracorporeal shockwave lithotripsy eswl clinical success achieve patient among responder patient 83.3 markedly improve complete pain relief first stent placement persist throughout follow-up period patient 91.7 able discontinue daily analgesic conclusion select patient obstructive chronic calcific pancreatitis dpd bypass cons

length of the corpus 310092
sample of the corpus ['clinical response dorsal duct drainage via minor papilla refractory obstruct chronic calcific pancreatitis background study aim complete stone removal main pancreatic duct might achieve patient obstructive chronic calcific pancreatitis report result endoscopic dorsal pancreatic duct dpd bypass obstruct stone ventral pancreatic duct vpd patient method patient obstructive chronic calcific pancreatitis treat dpd bypass clinical success define significant pain relief hospital admission pain management ongoing treatment period result among patient meet entry criterion 62.5 history unsuccessful endoscopic therapy fail extracorporeal shockwave lithotripsy eswl clinical success achieve patient among responder patient 83.3 markedly improve complete pain relief first stent placement persist throughout follow-up period patient 91.7 able discontinue daily analgesic conclusion select patient obstructive chronic calcific pancreatitis dpd bypass cons