In [96]:
import nltk
from collections import defaultdict
from nltk.stem.snowball import EnglishStemmer  # Assuming we're working with English
import string 


class Index:
    """Inverted index mapping lower-cased tokens to the documents containing them.

    Documents are identified by consecutive integer ids assigned in insertion
    order, starting at 0.

    The stop-word list keeps the index from creating an entry for every word
    in the language: words that ideally carry no meaning on their own
    ("so", "that", "the", ...) are skipped when documents are added.

    A stemmer yields a common form for different inflections of a base word
    (watching -> watch, ghostly -> ghost); the stem is not necessarily a
    dictionary word, since stemmers use fast heuristics.  With stemming, a
    lookup for "centrality" would in fact be a lookup for "central".
    NOTE: the stemmer is stored on the instance, but stemming is currently
    NOT applied by add() or lookup(); words match on their exact lower-cased
    form.
    """

    def __init__(self, tokenizer, stemmer=None, stopwords=None):
        """
        tokenizer   -- NLTK compatible tokenizer function (text -> tokens)
        stemmer     -- NLTK compatible stemmer (stored, currently unused)
        stopwords   -- iterable of ignored words; None or empty means none
        """
        self.tokenizer = tokenizer
        self.stemmer = stemmer
        # token -> list of ids of the documents containing it (ascending).
        self.index = defaultdict(list)
        # Not used by any method of this class; kept for existing callers.
        self.docindexes = {}
        # document id -> original document string.
        self.documents = {}
        # document id -> {token: occurrences}, filled in by add() so that
        # lookup() never has to re-tokenize a document.
        self._term_counts = {}
        self.__unique_id = 0
        self.stopwords = set(stopwords) if stopwords else set()

    def word_counts(self, text, word):
        """Count the exact (case-sensitive) occurrences of ``word`` in ``text``.

        The text is split with the tokenizer given to the constructor.  The
        count is returned as a string, e.g. with ``str.split`` as tokenizer
        ``word_counts("a b a", "a")`` gives ``'2'`` and an absent word
        gives ``'0'``.
        """
        occurrences = sum(1 for token in self.tokenizer(text) if token == word)
        return str(occurrences)

    def lookup(self, word):
        """
        Lookup a word in the index.

        The word is lower-cased before matching.  Returns a list of
        ``(document_id, count)`` tuples, one per document containing the
        word, in insertion order; ``count`` is the number of occurrences of
        the word in that document (ignoring case) formatted as a string.
        Returns an empty list when the word is a stop word or was never
        indexed.
        """
        word = word.lower()
        # .get() with a default: unlike indexing, it neither raises on a miss
        # nor makes the defaultdict create an empty entry for the word.
        return [
            (doc_id, str(self._term_counts[doc_id][word]))
            for doc_id in self.index.get(word, ())
        ]

    def add(self, document):
        """
        Add a document string to the index.

        The document is tokenized once; tokens are lower-cased, stop words
        are dropped, and the occurrences of every remaining token are
        recorded.  The document receives the next free integer id.
        """
        doc_id = self.__unique_id

        counts = defaultdict(int)
        for token in self.tokenizer(document):
            token = token.lower()
            if token in self.stopwords:
                continue
            counts[token] += 1

        # One posting per distinct token, so no duplicate check is needed.
        for token in counts:
            self.index[token].append(doc_id)

        self._term_counts[doc_id] = dict(counts)
        self.documents[doc_id] = document
        self.__unique_id += 1

    


In [100]:

# Build an index that tokenizes with NLTK and ignores English stop words.
index = Index(nltk.word_tokenize, EnglishStemmer(), nltk.corpus.stopwords.words('english'))

# 'file.txt' lists the documents to index, one file path per line.
filename = 'file.txt'
with open(filename) as file_to_read:
    # Blank lines are skipped so they are not passed to open() as a path.
    lines = [line.rstrip('\n') for line in file_to_read if line.strip()]
documents = lines

for document_key in documents:
    with open(document_key, 'r') as file:
        # Join lines with a space: replacing '\n' with '' would glue the last
        # word of each line to the first word of the next one.
        file_to_index = file.read().replace('\n', ' ')
    index.add(file_to_index)

print("Centrality exists in the documents x with frequency y: (x,y)")
print(index.lookup("centrality"))

Centrality exists in the documents x with frequency y: (x,y)
[(0, '1'), (1, '3'), (2, '1')]


In [None]:
"Dúvidas: Temos de construir a contagem do índice logo a priori ou só quando pedem certas palavras?"
"Se não, então como melhoramos aquela ineficiência do word_counting?"