# IFN647 Week 9 Workshop

In [3]:
import os
import numpy as np
from stemming.porter2 import stem
import string

## Task 1: Feature Selection from Training Set D

Design a python program to evaluate terms weights (i.e. define function w5) using the above procedure; and then select top-terms (e.g., their weights are greater than the mean terms' weights + theta), where theta is an experimental parameter.

You can firstly design a function w5 using the following:

Inputs: "Training_set" folder, "Training_benchmark.txt" (see week8 workshop) and theta
Output: A dictionary of features with their w5 weights

Then in the main program, save the features to a text file (Model_w5_R102.dat).

In [4]:
def eval_term_weights(inputfolder, training_benchmark):
    curr_dir = os.getcwd()
    
    d_minus = []
    d_plus = []

    ## Generating the d-minus dataset
    with open(training_benchmark, 'r') as f:
        lines = f.readlines()
    
        for line in lines:
            line = line.strip()
            line_string = line.split()

            if line_string[2] == '1':
                d_plus.append(line_string[1])
            if line_string[2] == '0':
                d_minus.append(line_string[1])

    D = os.listdir(f'{curr_dir}/Training_set')
    N = len(D)
    n_t = {}
    R = len(d_plus)
    r_t = {}
    
    # Loading stopwords
    stopwords_file = open('common-english-words.txt', 'r')
    stop_words_list = stopwords_file.readlines()
    stopwords_file.close()
    stop_words_list = stop_words_list[0].split(',')

    start_end = False

    # Populating the r_t dictionary
    for file_ in d_plus:
        parsed_text = [] 

        open_file = open(f'{curr_dir}/Training_set/{file_}.xml')
        file_contents = open_file.readlines()

        for line in file_contents:
            line = line.strip()

            if line.startswith('<text>'):
                start_end = True
            if line.startswith('<newsitem '):
                for part in line.split():
                    if part.startswith('itemid='):
                        docid = part.split('=')[1].split('/')[0]
                        docid = docid.replace('"', '')
            elif line.startswith('<p>'):
                line = line.replace('<p>', '').replace('</p>', '')
                line = line.replace('-', ' ')
                line = line.translate(str.maketrans('', '', string.punctuation))
                line = line.replace('quot', '')
            elif line.startswith('</text>'):
                start_end = False
            if start_end:
                parsed_text.append(line)
        
        split_text = []

        for line in parsed_text:
            for word in line.split(): 

                if word.lower() not in stop_words_list and not word.isdigit():
                    word = word.lower()
                    split_text.append(stem(word))

        split_text.remove('<text>')
        seen_within_doc = {}

        for term in split_text:
            if term not in seen_within_doc:
                seen_within_doc[term] = True
                if term not in r_t:
                    r_t[term] = {}

                r_t[term][file_] = 1

    # Populating the n_t dictionary
    for file_ in D:
        parsed_text = []

        open_file = open(f'{curr_dir}/Training_set/{file_}')
        file_contents = open_file.readlines()

        for line in file_contents:
            line = line.strip()

            if line.startswith('<text>'):
                start_end = True
            if line.startswith('<newsitem '):
                for part in line.split():
                    if part.startswith('itemid='):
                        docid = part.split('=')[1].split('/')[0]
                        docid = docid.replace('"', '')
            elif line.startswith('<p>'):
                line = line.replace('<p>', '').replace('</p>', '')
                line = line.replace('-', ' ')
                line = line.translate(str.maketrans('', '', string.punctuation))
                line = line.replace('quot', '')
            elif line.startswith('</text>'):
                start_end = False
            if start_end:
                parsed_text.append(line)
        
        split_text = []

        for line in parsed_text:
            for word in line.split(): 

                if word.lower() not in stop_words_list and not word.isdigit():
                    word = word.lower()
                    split_text.append(stem(word))

        split_text.remove('<text>')
        seen_within_doc = {}

        for term in split_text:
            if term not in seen_within_doc:
                seen_within_doc[term] = True
                if term not in n_t:
                    n_t[term] = {}

                n_t[term][file_] = 1

    w5_weights = {}
    all_terms = set(n_t.keys()) | set(r_t.keys())

    for term in all_terms:
        n_t_len = len(n_t.get(term, {}))
        r_t_len = len(r_t.get(term, {}))

        try:
            numerator = (r_t_len + 0.5) * (N - n_t_len + 0.5)
            denominator = (n_t_len - r_t_len + 0.5) * (R - r_t_len + 0.5)

            w5 = np.log(numerator / denominator)
            w5_weights[term] = w5
        except:
            w5_weights[term] = 0 


    return w5_weights
        

curr_dir = os.getcwd()
parsed = eval_term_weights(f'{curr_dir}/Test_set', f'{curr_dir}/Training_benchmark.txt')
print(parsed)

# Turn the parsed dictionary into a descending list
sorted_parsed = sorted(parsed.items(), key=lambda x: x[1], reverse=True)
print(sorted_parsed)

# Save the sorted list to a dat file
with open('Model_w5_R102.dat', 'w') as f:
    for term, weight in sorted_parsed:
        f.write(f'{term} {weight}\n')



{'regul': np.float64(0.9399324949311955), 'shock': np.float64(2.510726156717843), 'eastern': np.float64(1.7320921812291274), 'wake': np.float64(0.7626552172331246), 'three': np.float64(1.9947588547219024), 'suspicion': np.float64(-0.384588421788928), 'censorship': np.float64(2.38463247891996), 'friend': np.float64(1.9399057642036588), 'lisbet': np.float64(-1.1930777549208689), 'sceptic': np.float64(-0.6760792720477973), 'save': np.float64(1.9092353512745046), 'spot': np.float64(2.0681344707651443), 'prosecutor': np.float64(1.7895301962478962), 'finish': np.float64(2.843591067633864), 'johan': np.float64(3.2898025274974616), 'approv': np.float64(1.9092353512745046), 'exclaim': np.float64(1.5305793375217807), 'regular': np.float64(0.16216996899387798), 'latest': np.float64(0.41985384556026406), 'occas': np.float64(0.42579418974659), 'pressur': np.float64(0.6208059933181094), 'moment': np.float64(1.7386496439141066), 'didier': np.float64(2.38463247891996), 'suscept': np.float64(-1.1930777

## Task 2: 
Rank Documents in "Test_set" folder (U) Based on Features (the outcome of Task 1) to test the BM25 Model

1. Design an algorithm "procedure BM25Testing(Features, U)", to calculate BM25 ranking score for all documents in U and return a dictionary of {doc1:rank1, ...}.

2. Design a Python Function to implement procedure BM25Testing(Features, U) and save the ranking result in a file name "rankBM25.txt"

1. Designing the Algorithm

procedure BM25Testing(Features, U)<br>
// U is the set of incoming documents (or testing set)<br>
    Rel = {}<br>
    for d in U do { <br>
        rank(d) = 0<br>
        for all t_k in T do { <br>
            if T(t_k, d) = 1 then rank(d) + Features[term]<br>
        }<br>
        Rel[d] = rank(d)<br>
    }<br>
    Return Rel<br>

2. Implementation to Python File

In [8]:
def bm25_testing(Features, coll):
    rank = {}
    for id, doc in coll.get_docs().items():
        rank = 0
        for term in Features.keys():
            if term in doc.get_term_list():
                try:
                    rank[id] += parsed[term]
                except KeyError:
                    rank[id] = Features[term]
    return rank 

## Task 3
Open a ranking result file (e.g., "rankBM25.txt"), and then calculate it's Average Precision. Design a python program to:

1. Calculate Recall and Precision at rank positions where a relevant document was retrieved
2. Calculate the average precision