# IFN647 Week 7 Workshop

## Task 2 
Read a topic-doc-assignment file (e.g. relevance_judgements.txt, the benchmark) and a retrieved topic-doc-assignment file (e.g., binary_output.txt, the output of an IR model for query R105) and calculate the IR model's Recall, Precision and F-measure (F1).

- Please download two topic-doc-assignment files (rel_data.zip) and save them as a folder (e.g. "rel_data"). Both files have the format of 

Topic | DocumentID | Relevance 

For the file "relevance_judgements.txt", Relevance (Relevance Judgement) = "1" indicates relevant and "0" means non-relevant. For the file "binary_output.txt", the relevance values are generated by the IR model. We can obtain a set of retrieved documents by selecting thev rows with Relevance (Relevance Values) = "1". 

- Define a function rel_setting(inputpath), which reads the two topic-doc-assignment files in the folder `inputpath` and returns a pair of dictionaries {documentID: Relevance_judgement, ...} and {document:Relevance_value, ...} for all documents in "relevance_judgements.txt" and "binary_output.txt", respectively
- Define a main function to call function `rel_setting()`, calculate Recall, Precision and F-measure and display the result

In [58]:
# Importing the relevant libraries
import os

# Important variables
curr_dir = os.getcwd()

In [None]:
def open_file(file_path):
    open_file = open(file_path, 'r')
    lines = open_file.readlines()
    open_file.close()
    return lines

def rel_setting(inputpath): 
    relevance_values = {}
    judgement_values = {}

    target_folder = os.path.join(curr_dir, inputpath)

    folder_items = os.listdir(target_folder)
    for file in folder_items:
        full_file_path = os.path.join(target_folder, file)

        # Skipping the ranked output file
        if file != 'ranked_output.txt':
            lines = open_file(full_file_path)
        if file == 'relevance_judgments.txt':
            for line in lines:
                line = line.replace('\n', '')
                line = line.split(' ')
                relevance_values[line[1]] = line[2]
        if file == 'binary_output.txt': 
            for line in lines:
                line = line.replace('\n', '')
                line = line.split(' ')
                judgement_values[line[1]] = line[2]           

    return relevance_values, judgement_values



In [None]:
# Calculating the Recall, Precision and F-measures 
def main_function_task2(inputpath):
    relevance_results, judgement_results = rel_setting(inputpath)

    relevant_docs = 0

    for value in relevance_results.items():
        if value[1] == '1':
            relevant_docs += 1

    judgement_relevant_docs = 0
    for value in judgement_results.items():
        if value[1] == '1':
            judgement_relevant_docs += 1

    print(f'The number of relevant documents: {relevant_docs}')
    print(f'The nuumber of retrieved documents: {len(judgement_results)}')
    print(f'The number of retrieved documents that are relevant: {judgement_relevant_docs}')

    recall = judgement_relevant_docs / relevant_docs
    print(f'Recall = {recall}')
    precision = judgement_relevant_docs / len(judgement_results)
    print(f'Precision = {precision}')
    f_measure = (2 * precision * recall) / (precision + recall)
    print(f'F-measure = {f_measure}')
    return 

main_function_task2('rel_data')

The number of relevant documents: 16
The nuumber of retrieved documents: 37
The number of retrieved documents that are relevant: 14
Recall = 0.875
Precision = 0.3783783783783784
F-measure = 0.5283018867924528


## Task 3:
Read the ranked output file ("ranked_output.txt", the ranked output of another IR model for query R105), and calculate the average precision 

- Please download the ranked output file and save it in the same folder you created for Task 2
- Extend function rel_setting(inputpath) to return 3 dictionaries (two for task 2 and one for task 3). The dictionary for task 3 should have the format of {rankingNo: documentID, ...} and it only includes top-10 documents (ranked in descending order), where rankingNo = 1, 2, ..., 10.
- Extend the main function to calculate Recall and Precision at the rank positions where a relevant document was retrieved, and then calculate the average precision and print out the result

In [None]:
task_3_file = os.path.join(curr_dir, 'rel_data', 'ranked_output.txt')

ranked_output_file = open_file(task_3_file)

def rel_setting_task_3(inputpath): 
    relevance_values = {}
    judgement_values = {}
    sorted_ranking = {}
    document_ranking = {}

    target_folder = os.path.join(curr_dir, inputpath)

    folder_items = os.listdir(target_folder)
    for file in folder_items:
        full_file_path = os.path.join(target_folder, file)
        lines = open_file(full_file_path)
        # Skipping the ranked output file
        if file == 'ranked_output.txt':
            for line in lines:
                line = line.replace('\n', '')
                line = line.split(' ')

                sorted_ranking[line[1]] = line[2]

            # Sorting ther document ranking in descending order
            sorted_ranking = dict(sorted(sorted_ranking.items(), key=lambda item: item[1], reverse=True)[:10])
            i = 1
            for item, value in sorted_ranking.items():
                document_ranking[i] = item
                i += 1
        if file == 'relevance_judgments.txt':
            for line in lines:
                line = line.replace('\n', '')
                line = line.split(' ')
                relevance_values[line[1]] = line[2]
        if file == 'binary_output.txt': 
            for line in lines:
                line = line.replace('\n', '')
                line = line.split(' ')
                judgement_values[line[1]] = line[2]           

    return relevance_values, judgement_values, document_ranking

In [123]:
relevance_results, judgement_results, document_ranking = rel_setting_task_3('rel_data')
relevance_results

{'2493': '1',
 '2494': '1',
 '3008': '1',
 '5004': '1',
 '5223': '0',
 '5225': '1',
 '5226': '1',
 '6635': '0',
 '7937': '0',
 '15744': '1',
 '17650': '0',
 '22961': '0',
 '25263': '0',
 '26711': '0',
 '27966': '0',
 '28198': '0',
 '31141': '0',
 '33961': '0',
 '38229': '0',
 '40239': '1',
 '40259': '1',
 '48148': '1',
 '48587': '0',
 '49633': '1',
 '50202': '0',
 '51493': '1',
 '52175': '0',
 '59813': '0',
 '62213': '0',
 '64966': '0',
 '65210': '0',
 '80118': '0',
 '80483': '0',
 '80484': '1',
 '80884': '1',
 '86042': '1',
 '86961': '1'}

In [None]:
def main_function_task3(inputpath):
    relevance_results, judgement_results, document_rankings = rel_setting_task_3(inputpath)

    relevant_docs = 0

    for value in relevance_results.items():
        if value[1] == '1':
            relevant_docs += 1

    judgement_relevant_docs = 0
    for value in judgement_results.items():
        if value[1] == '1':
            judgement_relevant_docs += 1
    recall = judgement_relevant_docs / relevant_docs
    precision = judgement_relevant_docs / len(judgement_results)
    f_measure = (2 * precision * recall) / (precision + recall)

    ranked_relevant_docs = 0

    for rank, doc in document_rankings.items():
        print(judgement_results[doc])

    # for value in document_rankings.items():
    #     print(value)
    # for rank, doc in document_rankings.items():
    #     print(f'At position {rank}, docID: {doc}, precision = {precision}')

    


In [127]:
main_function_task3('rel_data')

{'2493': '1', '2494': '0', '3008': '1', '5004': '0', '5223': '0', '5225': '1', '5226': '1', '6635': '0', '7937': '0', '15744': '1', '17650': '0', '22961': '0', '25263': '0', '26711': '0', '27966': '0', '28198': '0', '31141': '1', '33961': '0', '38229': '0', '40239': '1', '40259': '0', '48148': '1', '48587': '0', '49633': '1', '50202': '0', '51493': '1', '52175': '0', '59813': '1', '62213': '0', '64966': '0', '65210': '1', '80118': '0', '80483': '0', '80484': '1', '80884': '0', '86042': '0', '86961': '1'}
{'2493': '1', '2494': '0', '3008': '1', '5004': '0', '5223': '0', '5225': '1', '5226': '1', '6635': '0', '7937': '0', '15744': '1', '17650': '0', '22961': '0', '25263': '0', '26711': '0', '27966': '0', '28198': '0', '31141': '1', '33961': '0', '38229': '0', '40239': '1', '40259': '0', '48148': '1', '48587': '0', '49633': '1', '50202': '0', '51493': '1', '52175': '0', '59813': '1', '62213': '0', '64966': '0', '65210': '1', '80118': '0', '80483': '0', '80484': '1', '80884': '0', '86042':