In [1]:
import itertools
import re
import sys
from stemming.porter2 import stem

In [2]:
def sort_stopwords():
    """Load the stopword list and return it as a set.

    Reads ``stopwordsfile.txt`` from the current working directory, one
    stopword per line, stripping surrounding whitespace from every line.
    The field labels 'id', 'text' and 'headline' are added as stopwords so
    that they are not considered in the inverted index.

    Returns:
        set: the stopwords from the file plus the three field labels.

    Raises:
        OSError: if ``stopwordsfile.txt`` cannot be opened.
    """
    # The context manager closes the handle as soon as the lines are read;
    # a bare open(...).readlines() leaves the file open until it is collected.
    with open('stopwordsfile.txt', 'r') as stopwords_file:
        stopwords = {line.strip() for line in stopwords_file}

    stopwords.update(('id', 'text', 'headline'))

    return stopwords

In [3]:
# Splits a collection file into its documents: returns a list with one entry
# per document, each entry being the list of that document's raw lines.

def split_file(file_name):
    """Split a collection file into per-document lists of lines.

    A document starts at a line beginning with ``ID:`` and runs up to (but
    not including) the next such line, or the end of the file. Lines that
    precede the first ``ID:`` line belong to no document and are dropped.

    Args:
        file_name: path of the collection file to read.

    Returns:
        list[list[str]]: one list per document, in file order, holding the
        document's lines (ID line first, line endings kept). Empty when the
        file contains no ``ID:`` line.

    Raises:
        OSError: if ``file_name`` cannot be opened.
    """
    with open(file_name, 'r') as collection:
        lines = collection.readlines()

    # Take each ID line's own position from enumerate(). Looking the line up
    # with list.index() returns the first equal line, which gives a wrong
    # boundary whenever two documents share an identical ID line.
    starts = [pos for pos, line in enumerate(lines) if line.startswith('ID:')]
    starts.append(len(lines))   # sentinel: end of the last document

    # Consecutive boundaries [start, next_start) delimit one document each.
    return [lines[begin:end] for begin, end in zip(starts, starts[1:])]

In [29]:
def _preprocess_document(lines, stopwords):
    """Tokenize, case-fold, stop and stem the body lines of one document."""
    # r'\w+' yields the maximal runs of word characters, i.e. the same tokens
    # as splitting on every non-word character and discarding empty strings.
    tokens = re.findall(r'\w+', ' '.join(lines))

    terms = []
    for token in tokens:
        token = token.lower()
        # Stopwords and purely numeric tokens are not indexed.
        if token not in stopwords and not token.isdigit():
            terms.append(stem(token))

    # Drop the first 'headlin' / 'text' stem, if present.
    # NOTE(review): sort_stopwords() already adds 'headline' and 'text' as
    # stopwords, so a match here presumably comes from another surface form
    # of the word in the document body. Kept so the index output does not
    # change - confirm whether this removal is still wanted.
    for label in ('headlin', 'text'):
        if label in terms:
            terms.remove(label)

    return terms


def _positions_by_term(terms):
    """Map each term to its 1-based positions, in order of first occurrence."""
    positions = {}
    for position, term in enumerate(terms, start=1):
        positions.setdefault(term, []).append(position)
    return positions


def _write_index(postings, index_path):
    """Write the postings to ``index_path``, terms in sorted order."""
    # The context manager closes the file even when a write fails.
    with open(index_path, 'w') as index_file:
        for term in sorted(postings):
            index_file.write("{}:\n".format(term))
            for docnumber, positions in postings[term]:
                index_file.write("\t{}: {}\n".format(docnumber, ','.join(map(str, positions))))
            index_file.write('\n')


def build_index(file_name, index_path='index.txt'):
    """Build a positional inverted index for a collection and save it to disk.

    Every document returned by ``split_file(file_name)`` is tokenized on
    non-word characters, lower-cased, stripped of stopwords and purely
    numeric tokens, and stemmed. Positions are 1-based and count only the
    terms that survive this preprocessing.

    The index file lists the terms in sorted order; under each term there is
    one line per document, in collection order::

        term:
        <TAB>docnumber: pos,pos,...
        <blank line>

    Args:
        file_name: path of the collection file to index.
        index_path: path of the index file to write; defaults to
            ``'index.txt'`` in the current working directory.

    Returns:
        None. The result is the file written to ``index_path``.

    Raises:
        OSError: if an input file cannot be opened or the index file cannot
            be written.
    """
    print("\nBUILDING INDEX\n...")

    stopwords = sort_stopwords()

    # term -> [(docnumber, [positions]), ...]; the entries of a term stay in
    # collection order because documents are processed in that order.
    postings = {}

    for document in split_file(file_name):
        if not document:
            continue    # nothing to index, and no ID line to read

        # The document number is whatever digits appear on the ID line.
        docnumber = re.sub("[^0-9]", '', document[0])
        terms = _preprocess_document(document[1:], stopwords)

        # One pass over the terms collects every position, instead of
        # rescanning the whole document once per token.
        for term, positions in _positions_by_term(terms).items():
            postings.setdefault(term, []).append((docnumber, positions))

    _write_index(postings, index_path)

    print("INDEXING COMPLETE\n")

In [30]:
# Build the inverted index for the sample collection.
# The path uses a forward slash because it is accepted on Windows as well as
# on POSIX systems, whereas a hard-coded backslash only resolves on Windows.
filename = 'collections/trec.sample.txt'
build_index(filename)


BUILDING INDEX
...
INDEXING COMPLETE

