## Functionality Summary
In this notebook, we:
* run baseline experiments with BM25;
* build the whole vocabulary on the RFAs and query it with publications, including those belonging to the test data;
* report evaluation metrics on the test dataset (training-set metrics are also printed for reference).

In [1]:
import pickle
import os
import sys
import argparse 
import ast
import pandas as pd
import numpy as np
import logging 
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.metrics.pairwise import linear_kernel
from gensim import corpora
from gensim.summarization import bm25

#local 
import utils_bsl as ut

In [2]:
def _str2bool(value):
    """Convert a command-line string to a real boolean.

    `type=bool` must not be used with argparse: `bool('False')` is True because
    any non-empty string is truthy, so the option could never be switched off
    with a textual value.

    Raises:
        argparse.ArgumentTypeError: if `value` is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    # '' is kept as falsy because it was the only string that `type=bool` mapped to False.
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))


#in the order that they will be used
parser = argparse.ArgumentParser(description = 'BM25 for Grant recommendation')
parser.add_argument('-data_path', type = str, default = 'newdata/', 
                    help = 'complete path to the training data [default:newdata/]')
parser.add_argument('-load_pretrained', type = _str2bool, default = True,
                    help = 'whether to load pretrained BM25 embeddings & corpus & vectorizer [default:True]')
parser.add_argument('-load_path', type = str, default = 'evalAuto/bm25/', 
                    help = 'path where bm25 embeddings & corpus & vectorizers are saved [default:evalAuto/bm25/]')
parser.add_argument('-top', type = int, default = 10, 
                    help = 'number of recommendations to take [default:10]')
# An empty argv is passed on purpose: inside a notebook, sys.argv holds the
# kernel's own arguments, so every option takes its default here.
args = parser.parse_args([])

In [3]:
def main(args):
    """Run the BM25 baseline and log ranking metrics for the train and test splits.

    The heavy lifting is delegated to the project-local `utils_bsl` module
    (imported as `ut`): loading the data, processing the RFA corpus, scoring
    the publication queries of each split, producing the top-`args.top`
    recommendations and printing the metrics.

    Args:
        args: namespace providing `data_path`, `load_path`, `load_pretrained`
            and `top` (see the argument parser of this notebook).

    Returns:
        None. Results are emitted through `ut.print_metrics` and the log file
        `<args.load_path>logfile`.
    """
    seed_val = 1234
    ut.set_seed(seed_val)

    # get logger started
    log_file = args.load_path + "logfile"
    log_format = "%(asctime)-15s %(levelname)-8s %(message)s"
    logging.basicConfig(level=logging.ERROR, filename=log_file, filemode="a+",
                        format=log_format)
    logger = logging.getLogger('BM25 for grant')
    handler = logging.FileHandler(log_file)
    handler.setFormatter(logging.Formatter(log_format))
    logger.addHandler(handler)
    # The root logger configured by basicConfig may write to the same file;
    # without this every record would be written twice (once via each handler).
    logger.propagate = False
    logger.error('BM25 for grant')

    try:
        # train, valid and test
        rfas, pubs, mix_df, \
        train_idx, valid_idx, test_idx, \
        train_citation, valid_citation, citation, \
        train_mixed, valid_mixed, test_mixed = ut.load_data(args.data_path)
        rfa_vecs, rfa_ids, vectorizer, dictionary = ut.process_rfa_corpus_bm25(
            df=rfas, outpath=args.load_path, load_pretrained=args.load_pretrained)

        # (file-name prefix, publication indices, mixed dict) for each split
        splits = (
            ('train_', train_idx, train_mixed),
            ('valid_', valid_idx, valid_mixed),
            ('test_', test_idx, test_mixed),
        )

        # Score every split first, then recommend, so the helper calls happen
        # in the same order as before (all queries, then all recommendations).
        queries = {}
        for split_name, split_idx, _unused_mixed in splits:
            scores, pmids, _ = ut.process_pub_query_bm25(
                idx=split_idx, mix_df=mix_df, pubs=pubs,
                vectorizer=vectorizer, dictionary=dictionary,
                idx_name=split_name, outpath=args.load_path,
                load_pretrained=args.load_pretrained)
            queries[split_name] = (scores, pmids)

        # prediction
        recommendations = {}
        for split_name, _unused_idx, split_mixed in splits:
            scores, pmids = queries[split_name]
            recommendations[split_name] = ut.sim_recommend_bm25(
                corpus_ids=rfa_ids, query_scores=scores, query_ids=pmids,
                mix_dict=split_mixed, mode='strict', outpath=args.load_path,
                query_name=split_name, top=args.top)

        # evaluation on train and test
        logger.error('=======train statistics======')
        ut.print_metrics(citation=train_citation, similarity_dict=recommendations['train_'],
                         logger=logger, ks=[1, 5])
        print('=========================================')
        logger.error('=======test statistics======')
        ut.print_metrics(citation=citation, similarity_dict=recommendations['test_'],
                         logger=logger, ks=[1, 5])
        logging.shutdown()

    except KeyboardInterrupt:
        # Plain prints: `colored` was never imported, so the previous handler
        # raised NameError instead of exiting gracefully.
        print('--' * 70)
        print('Exiting from training early')
    finally:
        # Detach and close our handler on every exit path so that re-running
        # this cell does not accumulate handlers (and duplicated log lines).
        logger.removeHandler(handler)
        handler.close()

In [4]:
# Entry point: run the pipeline with the `args` parsed in the configuration cell.
# The guard is true both when this code runs as a script and when the cell is
# executed in a notebook kernel, where `__name__` is "__main__" as well.
if __name__ == "__main__":
    main(args)

MRR:
0.8384266042714946
recall@1, recall@5:
0.655245851364705
0.884258470485342
precision@1, precision@5:
0.6968344294135687
0.6933972050900528
MAP:
0.8384266042714946
MRR:
0.8365500229153748
recall@1, recall@5:
0.6530666163427061
0.8852991130401963
precision@1, precision@5:
0.6935270805812418
0.6932289111153048
MAP:
0.8365500229153748
