# IFN647 Week 6 Workshop

In this workshop, you are going to design two language models by using inverted indexing structure. For a given query Q, a language model is going to find the relevant documents in a given folder, e.g., a folder “Test_Doc”. Assume the documents in the folder are indexed by the python function index_docs(), i.e., index_docs(“Test_Doc”, stop_words) which returns a dictionary with the following data structure: {term: {docID1: frequency1, DocID2: frequency2, …}, …}

In [1]:
import string 
from stemming.porter2 import stem
import os

## Task 1 

Design function index_docs() to construct an inverted index, a dictionary
{term:{docID1:freq1, DocID2:freq2, ...}, …}.

For each document in the “Test_Doc” folder, it firstly finds the “docid” in tag <newsitem>. For each index term (excluding number, punctuations, and tags \<p> and \</p> in \<text>), it also inserts
the term into the index or accumulates its frequency

In [59]:
def index_docs(input_paths, stop_words):
    word_count = 0
    docid = ''
    index = {}

    for path in input_paths:               
        my_file = open(f'/Users/aidanlockwood/Documents /GitHub/IFN647-Codebase/week6/wk6_data/Test_docs/{path}', 'r')

        stopwords_file = open(stop_words, 'r')
        stop_words_list = stopwords_file.readlines()
        stopwords_file.close()

        stop_words_list = stop_words_list[0].split(',')


        start_end = False
        parsed_text = []
        

        file_ = my_file.readlines()

        for line in file_:
            line = line.strip()

            if line.startswith('<text>'):
                start_end = True
            if line.startswith('<newsitem '):
                for part in line.split():
                    if part.startswith('itemid='):
                        docid = part.split('=')[1].split('/')[0]
                        docid = docid.replace('"', '')
            elif line.startswith('<p>'):
                line = line.replace('<p>', '').replace('</p>', '')
                line = line.translate(str.maketrans('', '', string.punctuation))
                line = line.replace('quot', '')
            elif line.startswith('</text>'):
                start_end = False
            if start_end:
                parsed_text.append(line)
        
        split_text = []

        for line in parsed_text:
            for word in line.split(): 
                word_count += 1

                if word.lower() not in stop_words_list and not word.isdigit():
                    word = word.lower()
                    split_text.append(stem(word))
    
        split_text.remove('<text>')

        for word in split_text:
            if word not in index:
                word_count = 1
            else:
                if docid not in index[word]:
                    index[word][docid] = 1
                else:
                    index[word][docid] += 1

            if word not in index:
                index[word] = {docid: word_count}
            elif (word in index) and (docid not in index[word]):
                index[word][docid] = word_count
                
        my_file.close()
    return index

# Testing the function on the Test_docs folder
folder_name = '/Users/aidanlockwood/Documents /GitHub/IFN647-Codebase/week6/wk6_data/Test_docs/'

# Getting the list of the files in the test_docs folder
files = os.listdir(folder_name)
files.pop(0)

print(files)
index_docs(files, 'wk6_data/common-english-words.txt')

['809481newsML.xml', '809495newsML.xml', '807600newsML.xml', '807606newsML.xml']


{'sherritt': {'809481': 5},
 'intern': {'809481': 3},
 'corp': {'809481': 3, '807600': 3, '807606': 1},
 'tuesday': {'809481': 1, '809495': 3, '807600': 1, '807606': 2},
 'whollyown': {'809481': 1},
 'metallurg': {'809481': 1},
 'technolog': {'809481': 1},
 'busi': {'809481': 1, '807600': 3},
 'plan': {'809481': 1},
 'acquir': {'809481': 1},
 'outstand': {'809481': 1},
 'share': {'809481': 1, '807600': 1, '807606': 2},
 'dynatec': {'809481': 8},
 'ltd': {'809481': 1},
 'privat': {'809481': 1},
 'mine': {'809481': 1},
 'drill': {'809481': 1},
 'servic': {'809481': 1, '807600': 2},
 'compani': {'809481': 5, '807600': 3},
 'spin': {'809481': 1},
 'merg': {'809481': 1},
 'new': {'809481': 1, '807600': 1},
 'form': {'809481': 1},
 'merger': {'809481': 1},
 'consult': {'809481': 1},
 'inc': {'809481': 1, '807600': 3, '807606': 1},
 'call': {'809481': 1, '809495': 2},
 'file': {'809481': 1},
 'canadian': {'809481': 2, '807600': 3, '807606': 1},
 'secur': {'809481': 2, '809495': 3},
 'regul': 

## Task 2

Let Q = {q1:1, q2:1, …, qn:1} be a dictionary, please define
a function likelihood_IR(I, Q) to estimate P(Q|D), i.e., it returns
the score of document D for the given query Q by using (insert function).

where fqi,D is the number of times word qi occurs in document
D, and |D| is the number of words in D.

In [61]:
def likelihood_IR(I, Q):
    L = {}
    R = {}

    D_len = {}

    for list in I.items():
        for id in list[1].items():
            R[id[0]] = 1
            D_len[id[0]] = 0.5
        if (list[0] in Q):
            L[list[0]] = I[list[0]]
    
    for q_term in Q.items():
        if not (q_term[0] in L):
            L[q_term[0]] = {}
    
    for list in I.items():
        for id in list[1].items():
            D_len[id[0]] += D_len[id[0]] + id[1]
    
    for (d, sd) in R.items():

        for (term, f) in L.items():
            if not (d in f):
                f[d] = 0
            
            sd = sd * (f[d] / D_len[d])

        R[d] = sd

    return R

In [70]:
q2_terms = index_docs(files, 'wk6_data/common-english-words.txt')

query = {'compani' : 1}

IR_result = likelihood_IR(q2_terms, query)
print(IR_result)

{'809481': 2.550546549747256e-19, '807600': 2.838968754325751e-51, '807606': 0.0, '809495': 0.0}
