<h1>Présentation du projet d'indexation de texte</h1>

In [None]:
from os import listdir
from os.path import isfile, join
from ipywidgets import FloatProgress
from IPython.display import display
from bs4 import BeautifulSoup as bs
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
import nltk
import string
import operator
import shutil,os
import re
import sys

In [None]:
# Memory constraint: number of documents indexed in memory before the current
# posting list is flushed to disk (see buildIndexFile).
NB_DOCUMENT = 1000
# Folder scanned for the article files to index.
DATA_PATH = "../data/latimes/"
# Folder receiving the posting-list files and the document-length file.
WRITING_PATH_POSTING_LIST = "../data/save/"
# File-name prefix of a posting-list file; the batch number is appended to it.
NAME_POSTING_LIST = "postingList_"
# Field separator used inside the posting-list files.
SEPARATOR = " "
# Name of the file holding the per-document lengths.
NAME_DOC_LIST = "docList"

# Weight of each parent tag: the text of a <p> whose parent is one of these
# tags is indexed that many times; paragraphs under any other tag are ignored.
TAGS_IMPORTANCE = {  'headline': 3,
                     'text': 1,
                     'section':1,
                     'graphic':2
                  }
# Tokens discarded by tokenizeWord: NLTK English stop words plus every
# character of string.punctuation.
STOP_WORDS = stopwords.words('english') + list(string.punctuation)
# Stemmer applied to every remaining token.
STEMMER = PorterStemmer()
# NOTE(review): not referenced by any cell visible in this section.
TAG_NUMBER = "NUMBER"

In [None]:
def cleanRepository(folder):
    """Delete every regular file located directly inside ``folder``.

    Sub-directories (and anything else that is not a regular file) are left
    untouched. A failure on one entry is printed and the loop carries on with
    the next entry.

    Args:
        folder: path of the directory to empty.
    """
    for entry_name in os.listdir(folder):
        target = os.path.join(folder, entry_name)
        try:
            if not os.path.isfile(target):
                continue
            os.unlink(target)
        except Exception as error:
            print(error)

In [None]:
def tokenizeWord(paragraphContent):
    """Lower-case and tokenize a piece of text, dropping the stop words.

    Hyphenated tokens are split on '-' because the tokenizer leaves them in
    one piece. Each fragment is checked against the stop-word list again, so
    that "state-of-the-art" does not leak "of" and "the" into the index, and
    empty fragments (from "--" or a trailing hyphen) are discarded.

    Args:
        paragraphContent: text to tokenize (str).

    Returns:
        The list of kept tokens, in reading order.
    """
    # A set gives constant-time membership tests; STOP_WORDS is a list.
    stop_words = set(STOP_WORDS)

    tokens = []
    for token in word_tokenize(paragraphContent.lower()):
        if token in stop_words:
            continue
        if '-' not in token:
            tokens.append(token)
            continue
        # nltk does not decompose hyphenated words: split them ourselves and
        # filter the fragments exactly like ordinary tokens.
        for fragment in token.split('-'):
            if fragment and fragment not in stop_words:
                tokens.append(fragment)

    return tokens

In [None]:
# Characters stripped from a stem before it is indexed. Raw string: the former
# non-raw literal relied on invalid escape sequences ("\." and "\`").
_STEM_NOISE = re.compile(r"[*'.+:,`/]")


def handleFormatText(paragraphContent, vocList, docLenght, docId):
    """Tokenize and stem a text, updating the vocabulary and document length.

    Each token is stemmed, stripped of the characters matched by
    ``_STEM_NOISE``, and dropped when the result is purely numeric or shorter
    than two characters.

    Args:
        paragraphContent: text to process; ``None`` or an empty string yields
            an empty result.
        vocList: dict mapping a stem to its number of occurrences; updated in
            place.
        docLenght: dict mapping a document id to its number of kept stems;
            updated in place (a missing ``docId`` starts at 0).
        docId: key of the current document in ``docLenght``.

    Returns:
        The list of kept stems, in reading order (duplicates included).
    """
    if not paragraphContent:
        return []

    stemWords = []
    for word in tokenizeWord(paragraphContent):
        stemWord = _STEM_NOISE.sub('', STEMMER.stem(word))
        if stemWord.isdigit() or len(stemWord) < 2:
            continue

        stemWords.append(stemWord)
        vocList[stemWord] = vocList.get(stemWord, 0) + 1
        docLenght[docId] = docLenght.get(docId, 0) + 1
    return stemWords

In [None]:
def buildPostingList(stemWords, currentDict, idDoc):
    """Count the occurrences of each stem of a document in the posting lists.

    Args:
        stemWords: iterable of stems found in the document (duplicates count).
        currentDict: dict mapping a stem to a ``{document id: occurrences}``
            dict; updated in place.
        idDoc: id of the document the stems come from.

    Returns:
        None.
    """
    for term in stemWords:
        # Fetch the postings of the term, creating them on first sight.
        postings = currentDict.setdefault(term, {})
        postings[idDoc] = postings.get(idDoc, 0) + 1

    return

In [None]:
def writingInFile(currentDict, index, path, name, separator):
    """Append the posting lists to the file ``path + name + str(index)``.

    One line is written per term, terms in sorted order. A line is the term
    followed by the ``document id`` / ``count`` pairs of its postings; every
    field, the last one included, is followed by ``separator``.

    Args:
        currentDict: dict mapping a term to a ``{document id: count}`` dict.
        index: value appended to the file name.
        path: prefix of the file path (concatenated as is).
        name: base name of the file.
        separator: string placed after every field.
    """
    ordered_terms = sorted(currentDict)

    with open(path + name + str(index), "a+") as output:
        for term in ordered_terms:
            fields = [term]
            for doc_id, count in currentDict[term].items():
                fields.extend((str(doc_id), str(count)))
            # Joining then adding one more separator reproduces the trailing
            # separator found at the end of every line.
            output.write(separator.join(fields) + separator + '\n')

In [None]:
def writingDictInFile(currentDict, path, name, separator):
    """Append every entry of a dict to the file ``path + name``.

    One ``<key><separator><value>`` line is written per entry, in the
    iteration order of the dict.

    Args:
        currentDict: dict to dump; keys and values are converted with ``str``.
        path: prefix of the file path (concatenated as is).
        name: name of the file.
        separator: string placed between the key and the value.
    """
    with open(path + name, "a+") as output:
        for key, value in currentDict.items():
            output.write(separator.join((str(key), str(value))) + '\n')

In [None]:
def buildIndexFile(vocList, docLenght):
    """Index every article file of DATA_PATH into posting-list files.

    Each ``<doc>`` element of each file is one document. The text of every
    ``<p>`` whose parent tag appears in TAGS_IMPORTANCE is stemmed and counted
    as many times as the weight of that tag. The in-memory posting list is
    written to a new numbered file every NB_DOCUMENT documents, and once more
    at the end for the remaining documents.

    Args:
        vocList: dict mapping a stem to its number of occurrences; filled in
            place.
        docLenght: dict mapping a document id (str) to its number of indexed
            stems; filled in place.
    """
    print("Building index File")

    # Files holding the articles: every regular file except text files and
    # the macOS folder metadata.
    articles = [
        DATA_PATH + file
        for file in listdir(DATA_PATH)
        if isfile(join(DATA_PATH, file)) and ".txt" not in file and ".DS_Store" not in file
    ]
    progress_bar = FloatProgress(min=0, max=len(articles))
    display(progress_bar)

    # Postings accumulated since the last flush: stem -> {doc id: count}.
    currentPostingList = {}
    counter = 0
    docIDCounter = 0

    for article in articles:
        # The file is only needed for the read; the with-block closes it.
        with open(article) as curArticle:
            fileXML = bs(curArticle.read(), "lxml")

        for document in fileXML.find_all('doc'):
            docIDCounter += 1
            # Copy the id into a plain str: a bs4 NavigableString keeps a
            # reference to its whole parse tree, so using it as a dict key
            # would keep every parsed file alive in memory.
            docID = str(document.find("docid").string)
            docLenght[docID] = 0

            for paragraph in document.find_all('p'):
                weight = TAGS_IMPORTANCE.get(paragraph.parent.name)
                if weight is None:
                    continue

                # get_text() also works for paragraphs with nested markup,
                # for which .string is None.
                text = paragraph.get_text()
                # The paragraph is processed once per unit of weight so that
                # the vocabulary, the document length and the postings all
                # reflect the importance of the parent tag.
                for _ in range(weight):
                    stemWords = handleFormatText(text, vocList, docLenght, docID)
                    buildPostingList(stemWords, currentPostingList, int(docID))

            if docIDCounter % NB_DOCUMENT == 0:
                counter += 1
                writingInFile(currentPostingList, counter, WRITING_PATH_POSTING_LIST, NAME_POSTING_LIST, SEPARATOR)
                # clear the ram memory.
                currentPostingList.clear()

        progress_bar.value += 1

    # Flush the last, partial batch: without this the postings of the final
    # (docIDCounter % NB_DOCUMENT) documents would never reach the disk.
    if currentPostingList:
        counter += 1
        writingInFile(currentPostingList, counter, WRITING_PATH_POSTING_LIST, NAME_POSTING_LIST, SEPARATOR)
        currentPostingList.clear()

In [None]:
def eprint(*args, **kwargs):
    """Print ``args`` on the standard error stream.

    Any keyword argument accepted by ``print`` other than ``file`` (for
    instance ``sep`` or ``end``) is forwarded unchanged.
    """
    error_stream = sys.stderr
    print(*args, file=error_stream, **kwargs)

In [None]:
class PostingList(object):
    """Posting list of one query term, offering sorted and random access.

    Sorted access walks ``ordered_list`` from the best score down, skipping
    the documents already marked as visited. Random access returns the score
    of a given document through ``access_dict``.
    """

    def __init__(self, qt, ordered_list, access_dict=None):
        """Build the posting list.

        Args:
            qt: string containing the query term this posting list is made for.
            ordered_list: list of (score, doc_id) ordered by decreasing score.
            access_dict: optional dict associating a doc_id to its score;
                computed from ``ordered_list`` when omitted.
        """
        self.qt = qt
        self.ordered_list = ordered_list
        if access_dict is not None:
            self.access_dict = access_dict
        else:
            self.access_dict = {}
            for score, doc in ordered_list:
                # A document must appear only once in a posting list.
                assert doc not in self.access_dict
                self.access_dict[doc] = score

        # Documents already consumed, through sorted or random access.
        self.docs_visited = set()
        # Cursor of the sorted access inside ordered_list.
        self.ordered_idx = 0

    def seek_next(self):
        """Move the cursor to the first non-visited entry and return it.

        Returns:
            The (score, doc_id) tuple of the first non-visited entry in the
            ordered traversal.

        Raises:
            IndexError: when every entry has been visited.
        """
        while self.ordered_list[self.ordered_idx][1] in self.docs_visited:
            self.ordered_idx += 1
        return self.ordered_list[self.ordered_idx]

    def next_item_predecessor_score(self):
        """Return the score of the entry preceding the next non-visited entry.

        The cursor of the sorted access is left untouched. When every entry
        has been visited the score of the last entry is returned. When the
        head of the list has not been visited yet there is no predecessor:
        the head score is returned (no entry of the list scores higher)
        instead of wrapping around to the last entry.
        """
        # Scan with a local index: advancing self.ordered_idx here while
        # testing tmp_idx never terminated and corrupted the cursor.
        tmp_idx = self.ordered_idx
        size = len(self.ordered_list)
        while tmp_idx < size and self.ordered_list[tmp_idx][1] in self.docs_visited:
            tmp_idx += 1
        if tmp_idx == 0:
            return self.ordered_list[0][0]
        return self.ordered_list[tmp_idx - 1][0]

    def mark_visited(self, doc_id):
        """Mark a document as visited so that sorted access skips it.

        Args:
            doc_id: id of the document; it must not be marked already.
        """
        assert doc_id not in self.docs_visited
        self.docs_visited.add(doc_id)

    def random_lookup(self, doc_id):
        """Return the score of ``doc_id`` in this posting list.

        Raises:
            KeyError: when the document is absent from ``access_dict``.
        """
        return self.access_dict[doc_id]

#TODO(mathishammel): Use a real priority queue. Lists are disgusting
class TopEntries(object):
    """Bounded ranking keeping the ``k`` highest (priority, element) pairs.

    ``top`` holds the kept pairs, sorted from the highest to the lowest.
    """

    def __init__(self, k):
        self.top = []
        self.k = k

    def insert(self, priority, element):
        """Add a pair, then keep only the ``k`` highest pairs."""
        self.top.append((priority, element))
        ranked = sorted(self.top, reverse=True)
        self.top = ranked[:self.k]

    def pop_lowest(self):
        """Remove and return the lowest pair of the ranking."""
        return self.top.pop()

    def get_min_score(self):
        """Return the lowest priority kept, or -1.0 for an empty ranking."""
        return self.top[-1][0] if self.top else -1.0

def calc_avg(lst):
    """Return the arithmetic mean of ``lst`` as a float.

    Raises:
        ZeroDivisionError: when ``lst`` is empty.
    """
    total = float(sum(lst))
    return total / len(lst)

def _seek_or_none(posting_list):
    """Return the next non-visited entry of a posting list, or None when it is exhausted."""
    try:
        return posting_list.seek_next()
    except IndexError:
        return None


def top_k_thresh(posting_lists, k, epsilon=0.0):
    """Compute the top-k documents with a threshold algorithm.

    At each round the best non-visited entry over all the posting lists is
    picked through sorted access, the scores of its document in the other
    lists are fetched through random access, and the average score is pushed
    into the ranking. The rounds stop once the ranking holds k documents and
    its lowest score reaches ``threshold / (1 + epsilon)``, or when the
    posting lists are exhausted.

    Every posting list is expected to contain every document: the random
    access raises KeyError otherwise.

    Args:
        posting_lists: list of PostingList objects, one per query term.
        k: length of the desired top-k ranking; must be lower than the length
            of every posting list.
        epsilon: tolerance of the stop condition (0.0 for the exact top-k).

    Returns:
        A list containing the (total_score, doc_id) of the top-k documents,
        best first.
    """
    if not posting_lists:
        return []

    #Check if we're not trying to get an impossible top-k
    for posting_list in posting_lists:
        assert k < len(posting_list.ordered_list)

    top_k = TopEntries(k)
    top_non_visited = [posting_list.seek_next() for posting_list in posting_lists]

    eprint('Initialized top inorder array :', top_non_visited)
    threshold = float('inf')

    # A ranking with fewer than k documents is never final, whatever its
    # lowest score is.
    while len(top_k.top) < k or top_k.get_min_score() < threshold / (1.0 + epsilon):
        eprint('Starting new round')
        # Sorted access: pick the list whose next entry has the best score
        # (the first list wins on ties). Scores are compared, not the
        # (score, doc_id) tuples against a float.
        selected_idx = 0
        for idx, entry in enumerate(top_non_visited):
            if top_non_visited[selected_idx][0] < entry[0]:
                selected_idx = idx
        selected_element = top_non_visited[selected_idx]
        selected_score, selected_doc_id = selected_element
        eprint('  Selected PL index is', selected_idx)
        eprint('  Selected element is', selected_element)
        posting_lists[selected_idx].mark_visited(selected_doc_id)
        top_non_visited[selected_idx] = _seek_or_none(posting_lists[selected_idx])

        # Random access: score of the selected document in the other lists.
        scores = []
        for idx, posting_list in enumerate(posting_lists):
            if idx == selected_idx:
                scores.append(selected_score)
                continue
            scores.append(posting_list.random_lookup(selected_doc_id))
            posting_list.mark_visited(selected_doc_id)
            if top_non_visited[idx][1] == selected_doc_id:
                top_non_visited[idx] = _seek_or_none(posting_list)

        eprint('  Individual scores for document',selected_doc_id,'are', scores)
        tot_score = calc_avg(scores)
        eprint('  Average score is', tot_score)
        top_k.insert(tot_score, selected_doc_id)
        eprint('  Current top K is', top_k.top)

        # An exhausted list means every document has been scored: the ranking
        # is final and there is nothing left to seek.
        if any(entry is None for entry in top_non_visited):
            eprint('  A posting list is exhausted, stopping.')
            break

        all_lists_ready = True
        next_prev_scores = []
        for idx,posting_list in enumerate(posting_lists):
            if posting_list.ordered_list[0][1] not in posting_list.docs_visited:
                eprint('  Posting list',idx,'is not ready for threshold computation yet, aborting.')
                all_lists_ready = False
                break
            next_prev_scores.append(posting_list.next_item_predecessor_score())
        eprint('  Nextprev scores are',next_prev_scores)
        # Keep the previous threshold while a list is not ready: averaging a
        # partial (or empty) list of scores would give a wrong bound.
        if all_lists_ready:
            threshold = calc_avg(next_prev_scores)
    return top_k.top

In [None]:
#Test Fagin's 2 on ppt example
pl1 = PostingList('hello', [(0.9,2),(0.8,5),(0.7,6),(0.6,4),(0.5,1),(0.4,3)])
pl2 = PostingList('world', [(0.85,3),(0.8,5),(0.75,2),(0.74,6),(0.74,1),(0.7,4)])

# print is a function in Python 3; the statement form is a SyntaxError.
print(top_k_thresh([pl1, pl2], 3))

<h2>Création du fichier d'index</h2>

In [None]:
# Empty the output folder first: writingInFile and writingDictInFile open
# their files in append mode ("a+"), so files left by a previous run would
# otherwise be extended instead of replaced.
cleanRepository(WRITING_PATH_POSTING_LIST)

In [None]:
# Both dicts are filled in place by buildIndexFile:
# vocList maps a stem to its number of occurrences,
# docLenght maps a document id to its number of indexed stems.
vocList = {}
docLenght = {}
buildIndexFile(vocList, docLenght)

In [None]:
# Persist the document lengths, one "<doc id> <length>" line per document.
writingDictInFile(docLenght, WRITING_PATH_POSTING_LIST, NAME_DOC_LIST, " ")