This notebook runs the system-specific experiments from "A Robust and Efficient Stopping Criteria for Systematic Reviews Using Poisson Processes."

In [1]:
# IMPORT LIBRARIES
import numpy as np
import pandas as pd
import math
from scipy.stats import poisson
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import operator
import random


# IMPORT EXPERIMENTAL FUNCTIONS
from utils.read_data_fns import *
from utils.target_method_fns import *  
from utils.knee_method_fns import *   
from utils.inhomogeneous_pp_fns import *  
from utils.eval_fns import *

In [2]:
# SET UP
# Loads the relevance judgements and one ranked run, builds the per-topic
# lookup dictionaries, splits topics into train/test sets and defines the
# parameter grids used by the tuning functions below.

# READ TOPIC RELEVANCE DATA
with open('data/relevance/qrel_abs_test.txt', 'r') as infile:
    qrels_data = infile.readlines()


# CHOOSE SPECIFIC RUN
with open('data/runs2017_table3/Waterloo/B-rank-normal.txt', 'r') as infile:
    run_data = infile.readlines()


# MAKE RANK AND RELEVANCE DICTIONARIES
doc_rank_dic = make_rank_dic(run_data)  # make dictionary of ranked docids for each queryid
query_rel_dic = make_rel_dic(qrels_data) # make dictionary of list of docids relevant to each queryid
rank_rel_dic = make_rank_rel_dic(query_rel_dic,doc_rank_dic) # make dic of list relevances of ranked docs for each queryid


# RANDOMLY SPLIT TOPICS INTO TRAIN AND TEST SETS
topics_list = make_topics_list(doc_rank_dic,1)  # presumably sorts topics by number of docs -- TODO confirm against utils
random.seed(1)  # fixed seed so the train/test split is reproducible
random.shuffle(topics_list)
topics_train = topics_list[0:10]  # first 10 shuffled topics are used for tuning
topics_test = topics_list[10:]  # all remaining topics are used for evaluation
print("N. train topics:",len(topics_train), "N. test topics:",len(topics_test))
test_docs_total = np.sum([len(doc_rank_dic[query_id])for query_id in topics_test])  # denominator for percentage-effort-saved
print("Total test docs:", test_docs_total)


# SET FIXED PARAMETERS 
n_windows = 10  # number of windows to make from sample for PP
des_recalls = [0.5,0.6,0.7, 0.8, 0.9, 0.95]  # desired recall levels evaluated
knee_target_ratio = 6 # knee method rho (Cormack and Grossman set to 6)


# PP PARAMETERS TO BE TUNED
# Each inner list is a schedule of increasing sample proportions; the first
# entry is the initial sample size and also names the parameter set.
sample_props_list = [[0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,
                0.7,0.75,0.8,0.85,0.9,0.95,1], 
                    [0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,
                0.7,0.75,0.8,0.85,0.9,0.95,1],
                     [0.6,0.65,
                0.7,0.75,0.8,0.85,0.9,0.95,1]  
                    ]  # proportion of docs to sample
min_rel_in_sample_list = [5,20,50] # min number of rel docs that must be in the initial sample to proceed with algorithm 
pp_adjusts = []  # every (sample schedule, min relevant) combination, as [sample_props, min_rel_in_sample]
for sample_props in sample_props_list:
    for min_rel_in_sample in min_rel_in_sample_list:
        adjust = [sample_props,min_rel_in_sample]
        pp_adjusts.append(adjust)



# KM PARAMETERS TO BE TUNED
km_adjusts = [0, 50, 100, 150, 200]  # adjustments to target ratio

N. train topics: 10 N. test topics: 20
Total test docs: 78019


In [3]:
# KNEE METHOD TUNING FUNCTIONS

# FN TO LOOP OVER TOPICS, IMPLEMENTING KNEE METHOD FOR EACH ADJUSTMENT VALUE
def do_knee_method(topics, adjusts, des_recall):
    """Run the knee method on every topic for every adjustment value.

    Parameters
    ----------
    topics : iterable of query ids to evaluate.
    adjusts : list of adjustment values passed to
        ``get_knee_stopping_point_var_adjust``.
    des_recall : desired recall; used to decide whether each topic's recall is
        acceptable (via ``calc_accept``) and as the row label of the result.

    Returns
    -------
    pandas.DataFrame with a single row labelled ``des_recall``. Columns are
    ``"<adjust> rel"`` (reliability over topics) for every adjustment,
    followed by ``"<adjust> eff"`` (summed effort over topics), rounded to
    two decimals.
    """
    # One list per topic, holding a (recall, effort, accept) tuple per adjustment.
    per_topic = {}

    for query_id in topics:
        n_docs = len(doc_rank_dic[query_id])  # total n. docs in topic
        ranked_rels = rank_rel_dic[query_id]  # binary relevance of ranked docs
        batches = get_batches(n_docs)

        topic_scores = []
        for adjust in adjusts:
            # Only the first two returned items are needed; the knee itself is unused.
            _knee, stop_n = get_knee_stopping_point_var_adjust(
                ranked_rels, batches, knee_target_ratio, adjust)[0:2]
            recall = calc_recall(ranked_rels, stop_n)
            accepted = calc_accept(recall, des_recall)
            topic_scores.append((recall, stop_n, accepted))  # effort == stopping point
        per_topic[query_id] = topic_scores

    # Gather per-adjustment vectors across topics.
    accept_vectors = {}
    effort_vectors = {}
    for pos, adjust in enumerate(adjusts):
        accept_vectors[str(adjust)+" rel"] = [scores[pos][2] for scores in per_topic.values()]
        effort_vectors[str(adjust)+" eff"] = [scores[pos][1] for scores in per_topic.values()]

    # Reliability columns first, then effort columns (downstream code relies on this order).
    summary = {}
    for label, accepts in accept_vectors.items():
        summary[label] = calc_reliability(accepts)
    for label, efforts in effort_vectors.items():
        summary[label] = np.sum(efforts)

    table = pd.DataFrame.from_dict(summary, orient='index', columns=[des_recall])
    return table.T.round(2)

# FN TO CALCULATE BEST KM ADJUSTMENT VALUE
def get_best_knee_adjust(knee_low_rel_adjustments, des_prob, des_recall, df):
    """Pick the knee-method adjustment to carry forward from training.

    Among adjustments whose reliability (``"<adjust> rel"``) is at least
    ``des_prob`` the one with the lowest effort (``"<adjust> eff"``) wins.
    If none reaches ``des_prob`` the one with the highest reliability wins.
    Ties go to the adjustment that appears first in the input list.

    Parameters
    ----------
    knee_low_rel_adjustments : list of candidate adjustment values.
    des_prob : required reliability.
    des_recall : row label of ``df`` to read the scores from.
    df : DataFrame as produced by ``do_knee_method``.

    Returns
    -------
    One-element list holding the chosen adjustment.
    """
    scores = df.loc[des_recall].to_dict()

    reliable = []    # (effort, adjust) for adjustments meeting des_prob
    unreliable = []  # (reliability, adjust) for the rest

    for adjust in knee_low_rel_adjustments:
        reliability = scores[str(adjust)+" rel"]
        if reliability >= des_prob:
            reliable.append((scores[str(adjust)+" eff"], adjust))
        else:
            unreliable.append((reliability, adjust))

    # min/max return the first extreme element, i.e. the earliest adjustment on ties.
    if len(reliable) > 0:
        chosen = min(reliable, key=lambda pair: pair[0])[1]
    else:
        chosen = max(unreliable, key=lambda pair: pair[0])[1]

    return [chosen]


# POISSON PROCESS TUNING FUNCTIONS

# FN TO LOOP OVER TOPICS, IMPLEMENTING PP FOR EACH PARAMETER SET
def do_pp_method(topics, adjusts,  des_prob, des_recall):
    """Run the inhomogeneous Poisson process (PP) stopping method.

    For every topic and every parameter set the ranking is examined in
    growing samples. After each sample a curve is fitted to the windowed
    relevance rate and used as the rate function of a Poisson process to
    predict the total number of relevant documents. Examination stops at the
    first sample for which the relevant documents already found cover
    ``des_recall`` of that prediction.

    Parameters
    ----------
    topics : iterable of query ids to evaluate.
    adjusts : list of ``[sample_props, min_rel_in_sample]`` parameter sets.
        ``sample_props`` is the schedule of sample proportions;
        ``min_rel_in_sample`` is the number of relevant documents the first
        sample must contain for the method to be attempted.
    des_prob : probability passed to ``predict_n_rel``.
    des_recall : desired recall; also the row label of the result.

    Returns
    -------
    pandas.DataFrame with a single row labelled ``des_recall``. Columns are
    ``"<first prop> <min rel> rel"`` (reliability) for every parameter set,
    followed by the matching ``"... eff"`` (summed effort) columns, rounded
    to two decimals.
    """
    # One list per topic, holding a (recall, effort, accept) tuple per parameter set.
    score_dic = {}

    for query_id in topics:
        score_dic[query_id] = []

        n_docs = len(doc_rank_dic[query_id])  # total n. docs in topic
        rel_list = rank_rel_dic[query_id]  # list binary rel of ranked docs

        for adjust in adjusts:
            sample_props = adjust[0]
            min_rel_in_sample = adjust[1]

            # Initial sample: used to check the minimum-relevance requirement.
            n_samp_docs = int(round(n_docs*sample_props[0]))
            sample_rel_list = rel_list[0:n_samp_docs]  # chunk of rel list examined in sample

            if np.sum(sample_rel_list) >= min_rel_in_sample:

                # pred_stop_n doubles as the loop sentinel: it starts at n_docs
                # and is lowered (to a value <= n_samp_docs) once the stopping
                # condition is met, which ends the loop.
                pred_stop_n = n_docs
                i = 0

                while (i < len(sample_props)) and (pred_stop_n > n_samp_docs):
                    sample_prop = sample_props[i]

                    n_samp_docs = int(round(n_docs*sample_prop))
                    sample_rel_list = rel_list[0:n_samp_docs]

                    # Points (one per window) the rate curve is fitted to.
                    windows = make_windows(n_windows, sample_prop, n_docs)
                    window_size = windows[0][1]
                    x, y = get_points(windows, window_size, sample_rel_list)

                    # A failed fit is not fatal: skip this sample size and
                    # move on to the next. Catch Exception rather than using a
                    # bare except so Ctrl-C / SystemExit still propagate.
                    try:
                        p0 = [0.1, 0.001]  # initialise curve parameters
                        opt, _ = curve_fit(model_func, x, y, p0)
                    except Exception:
                        opt = None

                    if opt is not None:
                        a, k = opt

                        # Sanity check: relevant docs actually found in the
                        # sample must cover des_recall of what the fitted curve
                        # predicts for the same span.
                        n_rel_at_end_samp = np.sum(sample_rel_list)
                        curve_over_sample = model_func(np.arange(1, len(sample_rel_list)+1), a, k)
                        pred_by_curve_rel_at_stop = int(round(np.sum(curve_over_sample)))

                        if n_rel_at_end_samp >= des_recall*pred_by_curve_rel_at_stop:
                            # Using inhom Poisson process with fitted curve as
                            # rate fn, predict total number of rel docs in the
                            # topic (subject to min prob).
                            mu = (a/-k)*(math.exp(-k*n_docs)-1)  # integral of model_func over the topic
                            pred_n_rel = predict_n_rel(des_prob, n_docs, mu)
                            des_n_rel = des_recall*pred_n_rel
                            if des_n_rel <= n_rel_at_end_samp:
                                pred_stop_n = n_rel_at_end_samp  # triggers loop exit

                    i += 1  # increase sample proportion size

                # Stop at the last sample size examined.
                inhom_stop_n = n_samp_docs

            else:
                # Not enough rel docs in the initial sample: examine everything.
                inhom_stop_n = n_docs

            inhom_recall = calc_recall(rel_list, inhom_stop_n)
            inhom_accept = calc_accept(inhom_recall, des_recall)
            score_dic[query_id].append((inhom_recall, inhom_stop_n, inhom_accept))

    # Gather per-parameter-set vectors across topics.
    rel_vec_dict = {}
    eff_vec_dict = {}
    for i, adjust in enumerate(adjusts):
        label = str(adjust[0][0])+" "+str(adjust[1])
        rel_vec_dict[label+" rel"] = [val[i][2] for val in score_dic.values()]
        eff_vec_dict[label+" eff"] = [val[i][1] for val in score_dic.values()]

    # Reliability columns first, then effort columns (downstream code relies on this order).
    df_score_dic = {}
    for key in rel_vec_dict.keys():
        df_score_dic[key] = calc_reliability(rel_vec_dict[key])
    for key in eff_vec_dict.keys():
        df_score_dic[key] = np.sum(eff_vec_dict[key])

    df = pd.DataFrame.from_dict(df_score_dic, orient='index', columns = [des_recall])
    df = df.T
    df = df.round(2)

    return df


# FN TO CALCULATE BEST PP ADJUSTMENT VALUE
def get_best_pp_adjust(adjusts, des_prob, des_recall, df):
    """Pick the PP parameter set to carry forward from training.

    Among parameter sets whose reliability is at least ``des_prob`` the one
    with the lowest effort wins. If none reaches ``des_prob`` the one with
    the highest reliability wins. Ties go to the parameter set that appears
    first in ``adjusts``.

    Parameters
    ----------
    adjusts : list of ``[sample_props, min_rel_in_sample]`` parameter sets.
    des_prob : required reliability.
    des_recall : row label of ``df`` to read the scores from.
    df : DataFrame as produced by ``do_pp_method``; columns are named
        ``"<sample_props[0]> <min_rel_in_sample> rel"`` / ``"... eff"``.

    Returns
    -------
    One-element list holding the chosen parameter set.
    """
    end_scores = df.loc[des_recall].to_dict()

    reliable = []    # (effort, adjust) for parameter sets meeting des_prob
    unreliable = []  # (reliability, adjust) for the rest

    for adjust in adjusts:
        # Same label scheme as do_pp_method. The original fallback branch used
        # str(adjust), which never matches a column and raised KeyError.
        label = str(adjust[0][0])+" "+str(adjust[1])
        reliability = end_scores[label+" rel"]
        if reliability >= des_prob:
            reliable.append((end_scores[label+" eff"], adjust))
        else:
            unreliable.append((reliability, adjust))

    # min/max return the first extreme element, i.e. the earliest parameter set on ties.
    if len(reliable) > 0:
        best_adjust = min(reliable, key=lambda pair: pair[0])[1]
    else:
        best_adjust = max(unreliable, key=lambda pair: pair[0])[1]

    return [best_adjust]


# FUNCTIONS FOR UNTUNED METHODS (OR, BL AND TM)

# FN TO LOOP OVER TOPICS, IMPLEMENTING TARGET METHOD
def do_target_method(topics, des_recall, des_prob):
    """Run the target method (TM) on every topic.

    Parameters
    ----------
    topics : iterable of query ids to evaluate.
    des_recall : desired recall; sizes the target set, decides acceptance and
        labels the result row.
    des_prob : probability passed to ``get_target_size``.

    Returns
    -------
    pandas.DataFrame with a single row labelled ``des_recall`` and columns
    ``"TM rel"`` (reliability over topics) and ``"TM eff"`` (summed effort,
    including the documents examined while sampling), rounded to two decimals.
    """
    # (recall, effort, accept) per topic
    per_topic = {}

    for query_id in topics:
        n_docs = len(doc_rank_dic[query_id])  # total n. docs in topic
        ranked_rels = rank_rel_dic[query_id]  # binary relevance of ranked docs

        # Re-seed for every topic so the random target sample is reproducible.
        random.seed(1)
        target_size = get_target_size(des_recall, des_prob)
        target_list, examined_list = make_target_set(ranked_rels, n_docs, target_size)
        stop_n = get_stopping_target(target_list, n_docs, target_size)
        examined_idxs = get_all_target_examined_idxs(examined_list, stop_n)

        recall = calc_recall(ranked_rels, stop_n)
        effort = len(examined_idxs)  # total effort (inc. sampling)
        accepted = calc_accept(recall, des_recall)
        per_topic[query_id] = (recall, effort, accepted)

    accepts = [score[2] for score in per_topic.values()]
    efforts = [score[1] for score in per_topic.values()]

    # Reliability first, then effort (downstream code relies on this order).
    summary = {}
    summary["TM rel"] = calc_reliability(accepts)
    summary["TM eff"] = np.sum(efforts)

    table = pd.DataFrame.from_dict(summary, orient='index', columns=[des_recall])
    return table.T.round(2)



# FN TO LOOP OVER TOPICS, IMPLEMENTING BASELINE METHOD
def do_baseline_method(topics, des_recall):
    """Run the baseline (BL) method on every topic.

    The baseline stops after examining the first ``int(n_docs*des_recall)``
    documents of the ranking.

    Parameters
    ----------
    topics : iterable of query ids to evaluate.
    des_recall : desired recall; sets the stopping proportion, decides
        acceptance and labels the result row.

    Returns
    -------
    pandas.DataFrame with a single row labelled ``des_recall`` and columns
    ``"BL rel"`` (reliability over topics) and ``"BL eff"`` (summed effort),
    rounded to two decimals.
    """
    # (recall, effort, accept) per topic
    per_topic = {}

    for query_id in topics:
        n_docs = len(doc_rank_dic[query_id])  # total n. docs in topic
        ranked_rels = rank_rel_dic[query_id]  # binary relevance of ranked docs

        stop_n = int(n_docs*des_recall)  # truncates towards zero
        recall = calc_recall(ranked_rels, stop_n)
        accepted = calc_accept(recall, des_recall)
        per_topic[query_id] = (recall, stop_n, accepted)  # effort == stopping point

    accepts = [score[2] for score in per_topic.values()]
    efforts = [score[1] for score in per_topic.values()]

    # Reliability first, then effort (downstream code relies on this order).
    summary = {}
    summary["BL rel"] = calc_reliability(accepts)
    summary["BL eff"] = np.sum(efforts)

    table = pd.DataFrame.from_dict(summary, orient='index', columns=[des_recall])
    return table.T.round(2)


# FN TO LOOP OVER TOPICS, IMPLEMENTING ORACLE METHOD
def do_oracle_method(topics, des_recall):
    """Compute the oracle (OR) effort for every topic.

    The oracle stops at the rank of the relevant document that brings the
    number of relevant documents found up to ``ceil(n_relevant*des_recall)``.

    Parameters
    ----------
    topics : iterable of query ids to evaluate.
    des_recall : desired recall; also the row label of the result.

    Returns
    -------
    pandas.DataFrame with a single row labelled ``des_recall`` and one column
    ``"OR eff"`` (summed effort over topics), rounded to two decimals.

    Notes
    -----
    A topic without any relevant document raises ``IndexError``, because the
    position lookup then indexes an empty array.
    """
    effort_by_topic = {}

    for query_id in topics:
        # Not needed by the oracle itself; kept so a topic missing from the
        # run still fails here, as before.
        n_docs = len(doc_rank_dic[query_id])
        ranked_rels = rank_rel_dic[query_id]  # binary relevance of ranked docs

        # 0-based rank positions of the relevant documents.
        rel_positions = np.where(np.array(ranked_rels) == 1)[0]
        n_needed = math.ceil(len(rel_positions)*des_recall)
        last_needed_pos = rel_positions[n_needed-1]
        effort_by_topic[query_id] = last_needed_pos+1  # convert position to a document count

    summary = {}
    summary["OR eff"] = np.sum(list(effort_by_topic.values()))

    table = pd.DataFrame.from_dict(summary, orient='index', columns=[des_recall])
    return table.T.round(2)



# FN TO CALCULATE PERCENTAGE OF EFFORT SAVED
def pes(eff):
    """Return the percentage of effort saved for a total effort of ``eff``.

    Computed relative to the module-level ``test_docs_total`` and rounded to
    one decimal place.
    """
    docs_not_examined = test_docs_total - eff
    return round(100*docs_not_examined/test_docs_total,1)


# FN TO CLEAN RESULTS FOR LATEX TABLE
def get_clean_results_dict(df_test, des_prob, name):
    """Turn a one-row method result into flat table entries.

    Parameters
    ----------
    df_test : one-row DataFrame returned by one of the ``do_*_method``
        functions. For ``"OR"`` the first column is the effort; for every
        other method the first column is reliability and the second effort.
    des_prob : required reliability; effort and PES are reported as ``"n/a"``
        when the method's reliability falls below it.
    name : method label, one of ``"PP"``, ``"KM"``, ``"TM"``, ``"BL"``, ``"OR"``.

    Returns
    -------
    dict with keys ``"<name> Rel"``, ``"<name> Eff"``, ``"<name> PES"`` (no
    ``Rel`` for ``"OR"``) plus ``"<name> TP"``, the tuned parameter label,
    for ``"PP"`` and ``"KM"``.
    """
    sd = {}
    prefix = str(name)

    if name == "OR":
        oracle_eff = df_test.iloc[0,0]
        sd[prefix+" Eff"] = oracle_eff
        sd[prefix+" PES"] = pes(oracle_eff)

    else:
        rel = df_test.iloc[0,0]
        sd[prefix+" Rel"] = rel
        if rel >= des_prob:
            eff = df_test.iloc[0,1]
            sd[prefix+" Eff"] = eff
            sd[prefix+" PES"] = pes(eff)
        else:
            # Reliability target missed: effort figures are not comparable.
            sd[prefix+" Eff"] = "n/a"
            sd[prefix+" PES"] = "n/a"

    if name == "PP" or name == "KM":
        # The first column is named "<tuned parameter> rel"; drop that
        # 4-character suffix. A fixed-width slice (previously [:7] for PP)
        # mangles short labels such as "0.6 5 rel" -> "0.6 5 r".
        tp = df_test.columns[0]
        sd[prefix+" TP"] = tp[:-4].strip()

    return sd


# fn to run system specific experiments for PP, KM, TM, OR and BL
def run_system_specific_experiments(des_prob):
    """Run PP, KM, TM, BL and OR for every desired recall level.

    PP and KM are first tuned on ``topics_train`` (best parameter chosen for
    the given ``des_prob``) and then evaluated on ``topics_test``; TM, BL and
    OR are evaluated on ``topics_test`` directly.

    Parameters
    ----------
    des_prob : required reliability.

    Returns
    -------
    pandas.DataFrame with one row per entry of ``des_recalls`` and the
    columns produced by ``get_clean_results_dict`` for each method, in the
    order PP, KM, TM, BL, OR.
    """
    rows = {}

    for des_recall in des_recalls:
        # Poisson process: tune on train topics, evaluate on test topics.
        pp_train = do_pp_method(topics_train, pp_adjusts, des_prob, des_recall)
        pp_best = get_best_pp_adjust(pp_adjusts, des_prob, des_recall, pp_train)
        pp_test = do_pp_method(topics_test, pp_best, des_prob, des_recall)
        sd_pp = get_clean_results_dict(pp_test, des_prob, "PP")

        # Knee method: tune on train topics, evaluate on test topics.
        km_train = do_knee_method(topics_train, km_adjusts, des_recall)
        km_best = get_best_knee_adjust(km_adjusts, des_prob, des_recall, km_train)
        km_test = do_knee_method(topics_test, km_best, des_recall)
        sd_km = get_clean_results_dict(km_test, des_prob, "KM")

        # Untuned methods: test topics only.
        sd_tm = get_clean_results_dict(
            do_target_method(topics_test, des_recall, des_prob), des_prob, "TM")
        sd_bl = get_clean_results_dict(
            do_baseline_method(topics_test, des_recall), des_prob, "BL")
        sd_or = get_clean_results_dict(
            do_oracle_method(topics_test, des_recall), des_prob, "OR")

        # Merge the per-method entries into one row, preserving method order.
        row = {}
        for method_entries in (sd_pp, sd_km, sd_tm, sd_bl, sd_or):
            row.update(method_entries)
        rows[des_recall] = row

    return pd.DataFrame.from_dict(rows).T


In [4]:
# Run experiments for p = 0.95: tune PP and KM on the train topics, evaluate
# all methods on the test topics, and display one row per desired recall.
df_95 = run_system_specific_experiments(0.95)
df_95

Unnamed: 0,PP Rel,PP Eff,PP PES,PP TP,KM Rel,KM Eff,KM PES,KM TP,TM Rel,TM Eff,TM PES,BL Rel,BL Eff,BL PES,OR Eff,OR PES
0.5,1.0,22429,71.3,0.15 5,1.0,21355.0,72.6,50,1,32283,58.6,1,39003,50,2346,97.0
0.6,1.0,22576,71.1,0.15 5,1.0,21355.0,72.6,50,1,38184,51.1,1,46803,40,3558,95.4
0.7,0.95,25064,67.9,0.15 5,1.0,21355.0,72.6,50,1,41015,47.4,1,54602,30,5659,92.7
0.8,1.0,36559,53.1,0.15 5,1.0,21355.0,72.6,50,1,44554,42.9,1,62406,20,7330,90.6
0.9,0.95,62260,20.2,0.15 5,0.85,,,50,1,54511,30.1,1,70205,10,13292,83.0
0.95,0.95,76360,2.1,0.15 5,0.75,,,50,1,66956,14.2,1,74108,5,17392,77.7


In [5]:
# Clean df for paper inclusion: drop the tuned-parameter columns and group
# the remaining columns by metric (Rel, Eff, PES).
# errors="ignore" keeps this cell safe to re-run after the columns are gone.
df_95 = df_95.drop(columns=["PP TP", "KM TP"], errors="ignore")
cols = ['PP Rel', 'KM Rel','TM Rel', 'BL Rel','PP Eff','KM Eff',  'TM Eff',  
        'BL Eff', 'OR Eff',  'PP PES', 'KM PES','TM PES', "BL PES",'OR PES']
df_95 = df_95[cols]
df_95

Unnamed: 0,PP Rel,KM Rel,TM Rel,BL Rel,PP Eff,KM Eff,TM Eff,BL Eff,OR Eff,PP PES,KM PES,TM PES,BL PES,OR PES
0.5,1.0,1.0,1,1,22429,21355.0,32283,39003,2346,71.3,72.6,58.6,50,97.0
0.6,1.0,1.0,1,1,22576,21355.0,38184,46803,3558,71.1,72.6,51.1,40,95.4
0.7,0.95,1.0,1,1,25064,21355.0,41015,54602,5659,67.9,72.6,47.4,30,92.7
0.8,1.0,1.0,1,1,36559,21355.0,44554,62406,7330,53.1,72.6,42.9,20,90.6
0.9,0.95,0.85,1,1,62260,,54511,70205,13292,20.2,,30.1,10,83.0
0.95,0.95,0.75,1,1,76360,,66956,74108,17392,2.1,,14.2,5,77.7


In [6]:
# Print the cleaned p = 0.95 table as LaTeX source.
print(df_95.to_latex())

\begin{tabular}{lllllllllllllll}
\toprule
{} & PP Rel & KM Rel & TM Rel & BL Rel & PP Eff & KM Eff & TM Eff & BL Eff & OR Eff & PP PES & KM PES & TM PES & BL PES & OR PES \\
\midrule
0.50 &      1 &      1 &      1 &      1 &  22429 &  21355 &  32283 &  39003 &   2346 &   71.3 &   72.6 &   58.6 &     50 &     97 \\
0.60 &      1 &      1 &      1 &      1 &  22576 &  21355 &  38184 &  46803 &   3558 &   71.1 &   72.6 &   51.1 &     40 &   95.4 \\
0.70 &   0.95 &      1 &      1 &      1 &  25064 &  21355 &  41015 &  54602 &   5659 &   67.9 &   72.6 &   47.4 &     30 &   92.7 \\
0.80 &      1 &      1 &      1 &      1 &  36559 &  21355 &  44554 &  62406 &   7330 &   53.1 &   72.6 &   42.9 &     20 &   90.6 \\
0.90 &   0.95 &   0.85 &      1 &      1 &  62260 &    n/a &  54511 &  70205 &  13292 &   20.2 &    n/a &   30.1 &     10 &     83 \\
0.95 &   0.95 &   0.75 &      1 &      1 &  76360 &    n/a &  66956 &  74108 &  17392 &    2.1 &    n/a &   14.2 &      5 &   77.7 \\
\bottomrule
\

In [None]:
# Run experiments for p = 0.7: tune PP and KM on the train topics, evaluate
# all methods on the test topics, and display one row per desired recall.
df_70 = run_system_specific_experiments(0.7)
df_70

In [None]:
# Clean df for paper inclusion: drop the tuned-parameter columns and group
# the remaining columns by metric (Rel, Eff, PES).
# errors="ignore" keeps this cell safe to re-run after the columns are gone.
df_70 = df_70.drop(columns=["PP TP", "KM TP"], errors="ignore")
cols = ['PP Rel', 'KM Rel','TM Rel', 'BL Rel','PP Eff','KM Eff',  'TM Eff',  
        'BL Eff', 'OR Eff',  'PP PES', 'KM PES','TM PES', "BL PES",'OR PES']
df_70 = df_70[cols]
df_70

In [None]:
# Print the cleaned p = 0.7 table as LaTeX source.
print(df_70.to_latex())