This implementation is from group 14, composed by:

- Ana Evans 86379
- Artur Guimaraes 86389
- Francisco Rosa 86417


This Notebook showcases the functional part of the first delivery. In each section we  present the function and a set of outputs. After each funtion we will mention the structure and the meaning of each input and output. Alternatively, you can include a standard funtion signature


In [1]:
import os, os.path
import re
import sys
import time
import nltk
import spacy
import whoosh
import shutil
import sklearn
import math
import numpy as np
import matplotlib as mpl 
import matplotlib.pyplot as plt
from heapq import nlargest 
from bs4 import BeautifulSoup
from lxml import etree
from whoosh import index
from whoosh import scoring
from whoosh.qparser import *
from whoosh.fields import *
from sklearn.metrics import *
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from textblob import TextBlob

topics = {}
judged_documents = {}
index_id = 1
# -----------------------------------------------------------------------
# getTopics - Auxiliary function that gathers info on all topics
#
# Input: directory - Directory path for project materials
# 
# Behaviour: Extracts topic info from '{directory}topics.txt' and updates
# the global dictionary which stores topic info
#
# Output: None
# -----------------------------------------------------------------------
def getTopics(directory):
    global topics
    
    topic_f = open('{}topics.txt'.format(directory), 'r')
    parsed_file = BeautifulSoup(topic_f.read(), 'lxml')

    topic_list = parsed_file.find_all('top')

    for topic in topic_list:
        split_topic = topic.getText().split('\n')
        split_topic = list(filter(lambda x: x!='', split_topic))

        number = split_topic[0].split(' ')[2][1:]
        title = processing(split_topic[1])
        topics[int(number)] = re.sub(' +',' ',title)  
    return

# -------------------------------------------------------------------------------------------------
# get_R_set - Auxiliary function that extracts the R set
#
# Input: directory - Directory path for project materials
# 
# Behaviour: Extracts the triplet (Topic id, Document id, Feedback) for each entry in the 
# R set, present in '{directory}qrels_test.txt' (R-test) and '{directory}qrels_test.txt' (R-train)
#
# Output: [R-Test, R-Train], each being a list of triplet entries
# -------------------------------------------------------------------------------------------------
def get_R_set(directory):
    global judged_documents

    r_test_f = open('{}qrels_test.txt'.format(directory), 'r')
    r_train_f = open('{}qrels_train.txt'.format(directory), 'r')

    r_test_lines = r_test_f.readlines()
    r_train_lines = r_train_f.readlines()

    r_test_lines = [r_test_lines, r_train_lines]
    r_set = [{},{}]
    
    for i in range(2):
        for line in r_test_lines[i]:
            split_entry = line.split(' ')
            topic_id = int(split_entry[0][1:])
            doc_id = int(split_entry[1])

            if doc_id not in judged_documents: 
                judged_documents[doc_id] = True

            feedback = int(split_entry[2])
            
            if topic_id not in r_set[i]:
                r_set[i][topic_id] = {}
            r_set[i][topic_id][doc_id] = feedback

    return r_set

#--------------------------------------------------
# get_xml_files_recursively - Auxiliary function to get_files_from_directory
#
# Input: path - The path to the parent directory or file from which to start our recursive function
#               
# Behaviour: Creates a list with the path to every file that's an hierarquical child of parent directory path,
# recursively going through each child in Post-Order traversing
#
# Output: A List with the paths to each file child
#--------------------------------------------------
def get_xml_files_recursively(path):
    global judged_documents

    files_list = []
    directory_list = os.listdir(path)
    for f in directory_list:
        n_path = '{}{}/'.format(path,f)
        if os.path.isdir(n_path):
            files_list.extend(get_xml_files_recursively(n_path))
        else:
            files_list.append(re.sub('//','/','{}/{}'.format(path,f)))
    return files_list

# -------------------------------------------------
# get_files_from_directory - Recursively gets all files from directory or file path, parsing the files from xml to objects
# and spliting them in D_Test and D_Train in the conditions specified by our project
#
# Input: path - The path to the parent directory or file from which to start our search
#               
# Behaviour: It starts by creating a list with the path to every file that's an hierarquical child of parent directory path,
# recursively going through each child in Post-Order traversing. Afterwards it parses each and every file from xml to a runtime
# object using the BeautifulSoup library. At last after having all files in object form it splits the dataset in D_Test and D_Train
# sets, according to their identifier (D_Test -> identifier > 1996-09-30   D_Train -> identifier <= 1996-09-30)
#
# Output: A List with the Lists of file objects present in D_Test and D_Train
# -------------------------------------------------
def get_files_from_directory(path):
    file_list = get_xml_files_recursively(path)

    parsed_files_test = []
    parsed_files_train = []

    for f in file_list:
        date_identifier = int(f.split('/')[2])

        open_file = open(f, 'r')
        parsed_file = BeautifulSoup(open_file.read(), 'lxml')
        
        if parsed_file.copyright != None:
            parsed_file.copyright.decompose()

        if parsed_file.codes != None:
            parsed_file.codes.decompose()
              
        if date_identifier <= 19960930:
            parsed_files_train += [parsed_file,]
        else:
            parsed_files_test += [parsed_file,]

    return (parsed_files_test, parsed_files_train)

# -----------------------------------------------------------------------------------------------------------
# processing - Processes text in String form
#
# Input: text - The text in String form to be processed
#        **kwargs - Optional named arguments, with the following functionality (default values prefixed by *)
#               lowercasing [*True | False]: Flag to perform Lowercasing 
#               punctuation [*True | False]: Flag to remove punction
#               spellcheck [True | *False]: Flag to perform spell check using TextBlob
#               stopwords [*True | False]: Flag to remove Stop Words 
#               simplication [*lemmatization | stemming | None]: Flag to perform Lemmatization or Stemming
#               
# Behaviour: Procceses the text in the input argument text as refered to by the arguments in **kwargs,
# behaviour being completely dependent on them except for Tokenization which is always performed
#
# Output: A String with the processed text 
# ----------------------------------------------------------------------------------------------------------
def processing(text, **kwargs):

    p_text = text
    # Lowercasing the entire string
    if 'lowercasing' not in kwargs or kwargs['lowercasing']:
        p_text = p_text.lower()

    #Remove punctuation
    if 'punctuation' not in kwargs or kwargs['punctuation']:
        p_text = re.sub("[/-]"," ",p_text)
        p_text = re.sub("[.,;:\"\'!?`´()$£€]","",p_text)

    # Spell Check
    if "spellcheck" in kwargs and kwargs['spellcheck']:          
        p_text = str(TextBlob(p_text).correct())

    # Tokenization
    tokens = nltk.word_tokenize(p_text)
    string_tokens = ''

    # Spell Check correction
    if "spellcheck" in kwargs and kwargs['spellcheck']:
        n_tokens = []
        for word in tokens:           
            n_tokens += ' {}'.format(TextBlob(word).correct)

    # Lemmatization
    if 'simplification' not in kwargs or kwargs['simplification'] == 'lemmatization':
        lemma = WordNetLemmatizer()

        #Remove stopwords
        if 'stopwords' not in kwargs or kwargs['stopwords']:
            for word in tokens:
                if word not in stopwords.words('English'):   
                    string_tokens += ' {}'.format(lemma.lemmatize(word))
        else: 
            for word in tokens: 
                string_tokens += ' {}'.format(lemma.lemmatize(word))

    # Stemming
    elif kwargs['simplification'] == 'stemming':
        stemer = nltk.stem.snowball.EnglishStemmer()

        #Remove stopwords
        if 'stopwords' not in kwargs or kwargs['stopwords']:
            for word in tokens:
                if word not in stopwords.words('English'):   
                    string_tokens += ' {}'.format(stemer.stem(word))
        else: 
            for word in tokens: 
                string_tokens += ' {}'.format(stemer.stem(word))

    # Case for no simplification
    else:
        for word in tokens: 
            string_tokens += ' {}'.format(word)   

    # Removing the first whitespace in the output 
    return string_tokens[1:]

# -------------------------------------------------------------------------------------------------------------------
# boolean_query_aux - Auxiliary function to boolean_query that will check repeated ocurrences of documents
#
# Input: document_lists - A List of Lists in which each inner List has all documents in which the n-th term appeared
#        k - The number of terms we are using
#
# Behaviour: The function starts by calculating our error margin, in other words the number of missmatches a document
# can have before we stop considering it as relevant. This function composes a very simple algorithmn, where for each
# document we find in a sublist (non repeated, we use the list 'seen' to check that) we check if it's contained within 
# all other sublists, until it's not contained in miss_m + 1 lists. When that's the case, the document is no longer 
# relevant and we move on to the next one, iterating upon all elements of all sublists. The Time Complexity of this 
# function is O(N^2) while the Space Complexity is O(N)
#
# Output: A List of all relevants docs that don't exceed miss_m missmatches
# -------------------------------------------------------------------------------------------------------------------
def boolean_query_aux(document_lists, k):
    miss_m = round(0.2*k)
    seen = []
    result_docs = []

    for term_docs in document_lists:
        for doc in term_docs:
            if doc not in seen:
                chances = miss_m
                flag = True
                for doc_list in document_lists:
                    if doc not in doc_list:
                        if chances == 0:
                            flag = False
                            break
                        chances -= 1
                if flag:
                    result_docs += [doc,]
                seen += [doc, ]

    result_docs.sort()
    return result_docs

# ------------------------------------------------------------------------------------------
# cosine_scoring - Function that scores a document based on cosine similarity 
#
# Input: searcher - The searcher associated with the index I
#        all the other arguments are built-ins from FunctionWeighting() and old whoosh.scoring
#        documentation
#
# Behaviour: Uses the tf-idf result from searcher.idf() and applies cosine similarity formula
# to it
#
# Output: cosine similarity weight vector formula 
# ------------------------------------------------------------------------------------------
def cosine_scoring(searcher, fieldnum, text, docnum, weight, QTF=1):
    idf = searcher.idf(fieldnum, text)

    DTW = (1.0 + math.log(weight)) * idf
    QMF = 1.0
    QTW = ((0.5 + (0.5 * QTF/ QMF))) * idf
    return DTW * QTW


# -------------------------------------------------------------------------------------------------
# reciprocal_rank_fusion - Auxiliary function to calculate the RRF for the top-p documents
# Uses the formula RBF_score(f) = sum (1 / (50 + rank_f))
# -------------------------------------------------------------------------------------------------
def reciprocal_rank_fusion(p, ranking_lists):
    document_ranks = {}

    for rank_l in ranking_lists:
        for i in range(len(rank_l )):
            if rank_l[i][0] not in document_ranks:
                document_ranks[rank_l[i][0]] = 0
            document_ranks[rank_l[i][0]] += 1 / (50 + i+1)

    p_highest = None

    if p != None:
        p_highest = nlargest(p, document_ranks, key=document_ranks.get)
    else:
        p_highest = nlargest(len(document_ranks), document_ranks, key=document_ranks.get)
    
    results = []

    for p in p_highest:
        results += [[p, document_ranks[p]]]  

    return results
# -------------------------------------------------------------------------------------------------
# find_R_test_labels - Function that finds the test labels for a given R_Set
#
# Input: R_test - The R_Test set 
#
# Behaviour: Extrapolates the feedback from the R_Test set to an array
#
# Output: The R_Test set labels in np array form
# -------------------------------------------------------------------------------------------------
def find_R_test_labels(R_test):
    r_labels = {}
    for doc in R_test:
        r_labels[doc] = R_test[doc]

    return r_labels

# -------------------------------------------------------------------------------------------------
# find_ranked_query_labels - Function that finds the test labels for given query_docs and r_labels
#
# Input: query_docs - The ranked query docs
#        r_labels - the labels R_Test set produced 
#
# Behaviour: Compares de R_Test set feedback with the ranked docs
#
# Output: The labels for the ranked query docs in np array form
# -------------------------------------------------------------------------------------------------
def find_ranked_query_labels(query_docs, r_labels):
    q_docs = np.array(query_docs)
    q_docs = q_docs[:,0]

    query_labels = []
    result_labels = []

    for doc in query_docs:
        if doc[0] in r_labels:
            query_labels += [[doc[0], 1], ]
            result_labels += [[doc[0], r_labels[doc[0]]], ]
    
    for doc in r_labels:
        if doc not in q_docs:
            query_labels += [[doc, 0], ]
            result_labels += [[doc, r_labels[doc]], ]
    

    return [np.array(query_labels), np.array(result_labels)]  

# -------------------------------------------------------------------------------------------------
# find_boolean_query_labels - Function that finds the test labels for given query_docs and r_labels
#
# Input: query_docs - The query docs
#        r_labels - the labels R_Test set produced 
#
# Behaviour: Compares the R_Test set feedback with the ranked docs
#
# Output: The labels for the query docs in np array form
# -------------------------------------------------------------------------------------------------
def find_boolean_query_labels(query_docs, r_labels):
    query_labels = []
    result_labels = []

    for doc in r_labels:
        if doc in query_docs:
            query_labels += [[doc, 1], ]
            result_labels += [[doc, r_labels[doc]]]
        else:
            query_labels += [[doc, 0], ]
            result_labels += [[doc, r_labels[doc]]]

    return [np.array(query_labels), np.array(result_labels)]   

# -------------------------------------------------------------------------------------------------
# bpref - Function that runs the bpref evaluation metric
# -------------------------------------------------------------------------------------------------
def bpref(sol_labels):
    R = 0
    N = 0
    bpref = 0
    n_count = 0
    for label in sol_labels:
        if label == 0:
            N += 1
        else:
            R += 1

    for label in sol_labels:
        if label == 0:
            n_count += 1
        else:
            bpref += (1 - n_count/(min(R,N)))

    return (1 / R) * bpref

# -------------------------------------------------------------------------------------------------
# evaluate_ranked_query - Auxiliary function to calculate statistical data
# -------------------------------------------------------------------------------------------------
def evaluate_ranked_query(topic, o_labels, sol_labels, **kwargs):
    results = {}

    results['accuracy'] = accuracy_score(sol_labels, o_labels)
    results['precision-micro'] = precision_score(sol_labels, o_labels, average='micro', zero_division=1)
    results['precision-macro'] = precision_score(sol_labels, o_labels, average='macro', zero_division=1)
    results['recall-micro'] =  recall_score(sol_labels, o_labels, average='micro')
    results['recall-macro'] =  recall_score(sol_labels, o_labels, average='macro')
    results['f-beta-micro'] = fbeta_score(sol_labels, o_labels, average='micro', beta=0.5)
    results['f-beta-macro'] = fbeta_score(sol_labels, o_labels, average='macro', beta=0.5)
    results['MAP'] = average_precision_score(sol_labels, o_labels)
    results['BPREF'] = bpref(sol_labels)

    if 'curves' in kwargs and kwargs['curves']:
        precision, recall, _ = precision_recall_curve(sol_labels, o_labels)
        PrecisionRecallDisplay(precision=precision, recall=recall).plot()
        plt.title('Precision Recall curve for Ranked topic {}'.format(topic))
        plt.show()

    return results

# -------------------------------------------------------------------------------------------------
# evaluate_boolean_query - Auxiliary function to calculate statistical data
# -------------------------------------------------------------------------------------------------
def evaluate_boolean_query(topic, o_labels, sol_labels, **kwargs):
    results = {}

    results['accuracy'] = accuracy_score(sol_labels, o_labels)
    results['precision-micro'] = precision_score(sol_labels, o_labels, average='micro', zero_division=1)
    results['precision-macro'] = precision_score(sol_labels, o_labels, average='macro', zero_division=1)
    results['recall-micro'] =  recall_score(sol_labels, o_labels, average='micro')
    results['recall-macro'] =  recall_score(sol_labels, o_labels, average='macro')
    results['f-beta-micro'] = fbeta_score(sol_labels, o_labels, average='micro', beta=0.5)
    results['f-beta-macro'] = fbeta_score(sol_labels, o_labels, average='macro', beta=0.5)
    results['MAP'] = average_precision_score(sol_labels, o_labels)

    
    if 'curves' in kwargs and kwargs['curves']:
        precision, recall, _ = precision_recall_curve(sol_labels, o_labels)
        PrecisionRecallDisplay(precision=precision, recall=recall).plot()
        plt.title('Precision Recall curve for Boolean topic {}'.format(topic))
        plt.show()


    return results

# -------------------------------------------------------------------------------------------------
# display_results - Auxiliary function to display calculated statistical data
# -------------------------------------------------------------------------------------------------
def display_results_per_q(q, results_ranked, results_boolean):
    print("\nResult for search on Topic {}".format(q))
    print("\nRanked Search:")
    for p in results_ranked:
        result_str= ''
        for m in results_ranked[p]:
            result_str += '{} = {}, '.format(m, round(results_ranked[p][m],4)) 
        print("For p={}: {}".format(p, result_str[:-2]))

    print("\nBoolean Search:")
    for k in results_boolean:
        result_str= ''
        for m in results_boolean[k]:
            result_str += '{} = {}, '.format(m, round(results_boolean[k][m],4)) 
        print("For k={}: {}".format(k, result_str[:-2]))

    return
# --------------------------------------------------------------------------------------------
# overlapping_terms() - Function that finds the overlapping terms for a given k range
#
# Input: 
#
# Behaviour: Queries the top terms for all k's in a given k range and checks them for overlap
#
# Output: Data about the overlaping terms
# --------------------------------------------------------------------------------------------
def overlapping_terms():
    I = index.open_dir("index_judged_docs_dir", indexname='index_judged_docs')
    k_range = [2,3,5,7,10,15]

    for k in k_range:
        top_terms = {}
        for q in range(101,201,1):
            results = extract_topic_query(q, I, k)
            for r in results:
                if r not in top_terms:
                    top_terms[r] = 0
                top_terms[r] += 1

        r_terms = 0
        for term in top_terms:
            if top_terms[term] > 1:
                r_terms += 1
        print("\nNumber of overlapping terms: {}".format(r_terms))
        print("Percent of overlapping terms: {}%".format(round(r_terms/len(top_terms)*100,3)))
        print(top_terms)
    return

The previous section composes the enterity of our code that's not directly tied to the 5 main functions or testing on them, including every auxiliary function.

In [2]:
def indexing(D, **kwargs):
    global index_id

    start_time = time.time()
    ind_name = 'index{}'.format(str(index_id))
    ind_dir = '{}_dir'.format(ind_name)

    if os.path.exists(ind_dir):
        shutil.rmtree(ind_dir)
        os.mkdir(ind_dir)
    else:
        os.mkdir(ind_dir)

    schema = Schema(id= NUMERIC(stored=True), content= TEXT(stored=True))
    ind = index.create_in(ind_dir, schema=schema, indexname=ind_name)
    ind_writer = ind.writer()

    if not index.exists_in(ind_dir, indexname=ind_name):
        print("Error creating index")
        return

    for doc in D:
        item_id = doc.newsitem.get('itemid')
        title = processing(re.sub('<[^<]+>', "", str(doc.title)), **kwargs)
        dateline = processing(re.sub('<[^<]+>|\w[0-9]+-[0-9]+-[0-9]+\w', "", str(doc.dateline)), **kwargs)
        text = processing(re.sub('<[^<]+>', "", str(doc.find_all('text')))[1:-1], **kwargs)
        
        result = nltk.word_tokenize('{} {} {}'.format(title, dateline, text))
        ind_writer.add_document(id=item_id, content=result)

    ind_writer.commit()
    
    time_required = round(time.time() - start_time, 6)
    
    space_required = os.path.getsize(ind_dir)

    return (ind, time_required, space_required)

# Change this to given rcv1 directory. We've removed the folders that are not relevant to the project.
D_set = get_files_from_directory("../rcv1/19960820/")
I =  indexing(D_set[0])
print(I)

# Baseline index for experiments with entire D_set
# I = index.open_dir("index_judged_docs_dir", indexname='index_judged_docs') 

(FileIndex(FileStorage('index1_dir'), 'index1'), 0.018958, 4096)


**indexing(D, kwargs)** - Creates an index after processing all text on data set D

**Input:** (D, kwargs)

        D - The data set we will be building the index with 
        
        kwargs - Optional named arguments for text preprocessing, with the following functionality (default values prefixed by *)
               lowercasing [*True | False]: Flag to perform Lowercasing 
               punctuation [*True | False]: Flag to remove punction
               spellcheck [True | *False]: Flag to perform spell check using TextBlob
               stopwords [*True | False]: Flag to remove Stop Words 
               simplication [*lemmatization | stemming | None]: Flag to perform Lemmatization or Stemming
               
 **Behaviour:** This function starts by creating the directory for our Index, after initializing our Schema fields. It then
 processes all documents on data set D and stores valuable information from them on the index (identifier, title, dateline and text).
 At last it commits the resulting processed documents to our index and calculates the total computational time the function used and the
 Disk space required to store the index.

 **Output:** A triplet tuple with the Inverted Index in object structure, the computational time for the function and 
 the disk space required to store the Inverted Index 

In [3]:
def extract_topic_query(q, I, k, **kwargs):
    global topics 
    topic = topics[q]

    topic_terms = []
    weight_vector = None

    # Chooses which score model to use from kwargs
    if 'scoring' not in kwargs:
        weight_vector = scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)

    elif kwargs['scoring'] == 'freq':
        weight_vector = scoring.Frequency()

    elif kwargs['scoring'] == 'tf-idf':
        weight_vector = scoring.TF_IDF()

    elif kwargs['scoring'] == 'dfree':
        weight_vector = scoring.DFree()

    elif kwargs['scoring'] == 'pl2':
        C = 1.0 if 'C' not in kwargs else kwargs['C']

        weight_vector = scoring.PL2(c=C)

    elif kwargs['scoring'] == 'bm25':
        b = 0.75 if 'B' not in kwargs else kwargs['B']
        content_b = 1.0 if 'content_B' not in kwargs else kwargs['content_B']
        k1 = 1.5 if 'K1' not in kwargs else kwargs['K1']

        weight_vector = scoring.BM25F(B=b, content_B=content_b, K1=k1)    

    with I.searcher(weighting=weight_vector) as searcher:
        parser = QueryParser("content", I.schema, group=OrGroup).parse(topic)
        results = searcher.search(parser, limit=None)
        res_list = [int(r.values()[1]) for r in results]

        numbers_list = []
        for i in res_list:
            numbers_list += [searcher.document_number(id=i),]

        topic_terms = searcher.key_terms(numbers_list, "content", numterms=k, normalize=True)
      
    result = []
    for term in topic_terms:
        result += [term[0], ]

    return result

# This material folder is kept within the project directory 
material_dic = 'material/'

getTopics(material_dic)
print(extract_topic_query(120,I,10))

['death', 'council', 'accident', 'yearly', 'people', 'simpson', 'result', 'mining', 'mine', 'half']


 **extract_topic_query(q,I,k,kwargs)** - Return the top-k informative terms from the topic q agains I using parameterizable scoring

 **Input:** (q,I,k,kwargs)
 
        q - The identifier number of the topic we want to search about
 
        I - The Index object in which we will perform our search
        
        k - The number of top-k terms to return
        
        kwargs - Optional named arguments to parameterize scoring, with the following functionality (default values prefixed by *)
               scoring [freq | tf-idf | dfree | pl2 |*bm25] - Chooses the scoring model we will use to score our terms
               C [float | *1.0] - Free parameter for the pl2 model
               B [float | *0.75] - Free parameter for the BM25 model
               content_B [float | *1.0] - Free parameter specific to the content field for the BM25 model
               k1 [float | *1.5] - Free parameter for the BM25 model

 **Behaviour:** Extracting the relevant model information from kwargs, this function uses the index I present in its arguments 
 to perform a scored search on the top-k informative terms for topic q. It does so by creating a QueryParser object to parse
 the entire lenght of terms from q we've stored in our global topics structure and by using searcher.key_terms() to return
 the top terms according to our scoring weight vector. 

 **Output:** A List that contains the top k terms 

In [4]:
def boolean_query(q, k, I, **kwargs):
    terms = extract_topic_query(q, I, k, **kwargs)

    document_lists = []
    with I.searcher() as searcher:
        for term in terms:
            parser = QueryParser("content", I.schema, group=OrGroup).parse(term)
            results = searcher.search(parser, limit=None)
            term_list = [int(r.values()[1]) for r in results]
            document_lists += [term_list,]
            
    return boolean_query_aux(document_lists, k)

print(boolean_query(120,5,I))

[188920, 198038, 212755, 213354, 325202, 326277, 367280, 367978, 383110, 385019, 480287, 523104, 557332, 558329]


 **boolean_query(q,k,I,kwargs)** - Function that will query all documents in index I and find those who contain
 all top k-terms relevant to topic q allowing up to round(0.2*k) missmatches 

 **Input:** (q,k,I,kwargs)
 
        q - The identifier number of the topic we want to search about
 
        k - The number of top k-terms to check documents for
        
        I - The Index object in which we will perform our search
        
        kwargs - Optional arguments refer to extract_topic_query() and are described as follows:
               scoring [freq | tf-idf | dfree | pl2 |*bm25] - Chooses the scoring model we will use to score our terms
               C [float | *1.0] - Free parameter for the pl2 model
               B [float | *0.75] - Free parameter for the BM25 model
               content_B [float | *1.0] - Free parameter specific to the content field for the BM25 model
               k1 [float | *1.5] - Free parameter for the BM25 model

 **Behaviour:** The function starts by running extract_topic_query to return top k-terms with which
 we will search for the relevant docs for topic q. Then we use the index I to perform a simple
 search on, parsing the result of our search per term to our auxiliary function. 

 **Output:** A List of all relevants docs that don't exceed miss_m missmatches

In [5]:
def ranking(q, p, I, **kwargs):
    global topics
    topic = topics[q]

    weight_vector = None
    if 'ranking' not in kwargs:
        weight_vector = scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)

    elif kwargs['ranking'] == 'cosine':
        weight_vector = scoring.FunctionWeighting(cosine_scoring)

    elif kwargs['ranking'] == 'tf-idf':
        weight_vector = scoring.TF_IDF()

    elif kwargs['ranking'] == 'bm25':
        b = 0.75 if 'B' not in kwargs else kwargs['B']
        content_b = 1.0 if 'content_B' not in kwargs else kwargs['content_B']
        k1 = 1.5 if 'K1' not in kwargs else kwargs['K1']

        weight_vector = scoring.BM25F(B=b, content_B=content_b, K1=k1)  

    elif kwargs['ranking'] == 'RRF':
        bm25_ranking_1 = ranking(q, p, I, ranking="bm25")
        bm25_ranking_2 = ranking(q, p, I, ranking="bm25", b=0.5, content_b=1.25, k1=1.25)
        bm25_ranking_3 = ranking(q, p, I, ranking="bm25", b=0.5, content_b=1.5, k1=1.00)

        return reciprocal_rank_fusion(p, [bm25_ranking_1, bm25_ranking_2, bm25_ranking_3])

    with I.searcher(weighting=weight_vector) as searcher:
        parser = QueryParser("content", I.schema, group=OrGroup).parse(topic)
        results = searcher.search(parser, limit=p)
        
        term_list = []

        if p != None:
            for i in range(p):
                if i < len(results):
                    term_list += [(int(results[i].values()[1]), results.score(i)), ]
        else:
            for i in range(len(results)):
                term_list += [(int(results[i].values()[1]), results.score(i)), ]

    return term_list

print(ranking(120,50,I))

[(584459, 28.254201392850014), (278616, 28.192934774700113), (404182, 27.12153764290254), (792492, 25.461922516243582), (171226, 25.19692728128043), (204747, 23.034778800657648), (123337, 22.999708352384996), (378363, 22.930812645277832), (486353, 22.521280927841886), (568999, 21.639495077237466), (190362, 21.060485320547304), (136594, 20.389647463670762), (436113, 20.11444001536727), (520094, 19.202343229641027), (705685, 18.857761187469187), (435901, 18.681432936218037), (693723, 18.561516348232836), (107933, 18.371357184750785), (519806, 18.293917399506192), (522470, 18.293917399506192), (397346, 18.180028882713714), (387471, 18.148527604028523), (772032, 18.137292100900307), (348003, 18.109250909798007), (359847, 18.013298772517736), (105063, 17.858644194621604), (95224, 17.84700265832251), (277667, 17.692147598392992), (190374, 17.547728297565968), (705688, 17.45938080146687), (110616, 17.30748587669453), (774533, 17.2400351818085), (790186, 17.144101698753467), (101115, 17.098811

 **ranking(q,p,I,kwargs)** - Function that will query all documents in index I and rank the top p ones

 **Input:** (q,p,I,kwargs)
 
        q - The identifier number of the topic we want to search about 
        
        p - The number of top ranked documents we will return
        
        I - The Index object in which we will perform our search
        
        kwargs - Optional named arguments to parameterize scoring, with the following functionality (default values prefixed by *)
               ranking [cosine | RRF | tf-idf | *bm25] - Chooses the scoring model we will use to score our terms
               B [float | *0.75] - Free parameter for the BM25 model
               content_B [float | *1.0] - Free parameter specific to the content field for the BM25 model
               k1 [float | *1.5] - Free parameter for the BM25 model

 **Behaviour:** The function uses the weight vector generated by its given scoring system to search and rank  
 the top-p documents in the index according to the full topic text.

 **Output:** A List of lists that contains pairs [document_id, score] in descending score ordering

In [6]:
def evaluation(Q_test, R_test, D_test, **kwargs):
    I = index.open_dir("index_judged_docs_dir", indexname='index_judged_docs')
    #I = indexing(D_test, **kwargs)[0]

    results_ranked = {}
    results_boolean = {}
    k_range = [1,2,4,6,8,10]
    p_range = [100,200,300,400,500, None]

    if 'k_range' in kwargs:
        k_range = kwargs['k_range']
    if 'p_range' in kwargs:
        p_range = kwargs['p_range']

    for q in Q_test:
        r_labels = find_R_test_labels(R_test[q])

        for p in p_range:
            score_docs = ranking(q, p, I, **kwargs)
            ranked_labels = find_ranked_query_labels(score_docs, r_labels)

            results_ranked[p] = evaluate_ranked_query(q, ranked_labels[0][:, 1],ranked_labels[1][:, 1], **kwargs)

        for k in k_range:
            boolean_docs = boolean_query(q, k, I, **kwargs)
            query_labels = find_boolean_query_labels(boolean_docs, r_labels)

            results_boolean[k] = evaluate_boolean_query(q, query_labels[0][:, 1], query_labels[1][:, 1], **kwargs)
            
        display_results_per_q(q, results_ranked, results_boolean)
        
    return

Q_test = [120]
R_set = get_R_set(material_dic)
evaluation(Q_test, R_set[0], D_set[0])


Result for search on Topic 120

Ranked Search:
For p=100: accuracy = 0.7205, precision-micro = 0.7205, precision-macro = 0.7857, recall-micro = 0.7205, recall-macro = 0.6414, f-beta-micro = 0.7205, f-beta-macro = 0.6898, MAP = 0.534, BPREF = 0.5999
For p=200: accuracy = 0.7301, precision-micro = 0.7301, precision-macro = 0.7491, recall-micro = 0.7301, recall-macro = 0.6675, f-beta-micro = 0.7301, f-beta-macro = 0.7052, MAP = 0.5427, BPREF = 0.6068
For p=300: accuracy = 0.6627, precision-micro = 0.6627, precision-macro = 0.6368, recall-micro = 0.6627, recall-macro = 0.6216, f-beta-micro = 0.6627, f-beta-macro = 0.6303, MAP = 0.4669, BPREF = 0.5417
For p=400: accuracy = 0.559, precision-micro = 0.559, precision-macro = 0.5393, recall-micro = 0.559, recall-macro = 0.5404, f-beta-micro = 0.559, f-beta-macro = 0.5391, MAP = 0.4021, BPREF = 0.4543
For p=500: accuracy = 0.5518, precision-micro = 0.5518, precision-macro = 0.5476, recall-micro = 0.5518, recall-macro = 0.5504, f-beta-micro = 0.

 **evaluation(Qtest ,Rtest ,Dtest ,args)** - Function that fully evaluates our IR model, providing full statiscal analysis for several
 p and k values across multiple ranges and topics

 **Input:** (Qtest ,Rtest ,Dtest ,args)
 
        Q_test - The set of topics we will evaluate the perform of our IR model on
        
        R_test - The number of top ranked documents we will return
        
        D_test - The Index object in which we will perform our search
        
        kwargs - The additional args in this function also refer to the additional args in indexing(),
        ranking() and boolean_query(), for which documentation is provided above. Other than that, we have:
               k_range [list of ints | *[1,2,4,6,8,10, 15]] - List of k values our model will test
               p_range [list of ints or None | *[100,200,300,400,500, None]] - List of p values our model will test
               curves [True | *False] - Display the precision/recall curves

 **Behaviour:** The function provides full statistics for every topic in Q_test, using R_test and D_test
 to build an index. Then, for each p in p_range it will use ranking() to rank the top p documents
 and for each k in k_range it will use k to evaluate the relevant docs using boolean_query(). In the end,
 it uses retrival results to provide full statiscal analysis.

 **Output:** Full statistical analysis for the provided input args

In [7]:
# A List of some examples of using optional args with our code

print(extract_topic_query(120,I,20, scoring='freq'))
print(extract_topic_query(121,I,20, scoring='tf-idf'))
print(extract_topic_query(122,I,20, scoring='pl2'))
print(extract_topic_query(123,I,20, scoring='bm25', B=0.25, content_B= 1.25, k1=1.25))

print(ranking(150, 20, I, ranking='bm25', B=3, content_B= 0.25, k1=0.25))
print(ranking(150, 20, I, ranking='bm25', B=0.25, content_B= 3, k1=2))
print(ranking(150, 20, I, ranking='RRF'))

Q_test = [105,120]
evaluation(Q_test, R_set[0], D_set[0], ranking='RRF', scoring='tf-idf', k_range=[2,3,4,5], p_range=[100,200,300,400,1000])
evaluation(Q_test, R_set[0], D_set[0], ranking='bm25', scoring='freq', k_range=[6,7,8,9], p_range=[500,600,700])

['death', 'council', 'accident', 'yearly', 'people', 'simpson', 'result', 'mining', 'mine', 'half', 'directive', 'possibly', 'killed', 'agenda', 'proposal', 'may', 'police', 'dec', 'brussels', 'said']
['china', 'nuclear', 'pakistan', 'beijing', 'chinese', 'hong', 'council', '0', 'kong', 'taiwan', 'missile', 'korea', 'india', 'power', '*', 'north', 'state', 'south', 'official', 'u']
['disease', 'council', 'agenda', 'bse', 'directive', 'european', 'cow', 'brussels', 'possibly', 'proposal', 'drug', 'eu', 'beef', 'hold', 'mad', 'patient', 'cattle', 'health', 'luxembourg', 'human']
['newspaper', 'percent', 'year', 'said', 'decline', 'labour', 'party', 'daily', '*', 'conservative', 'china', 'government', 'would', 'poll', 'election', 'last', 'time', 'report', 'month', 'minister']
[(250177, 16.188240126228926), (608919, 15.351559299148075), (650471, 15.199293586677655), (558586, 15.138922185783136), (401374, 14.63466917650236), (254857, 14.281682874793805), (808812, 14.101009870589053), (53180