Importing the essential libraries and downloading the required NLTK resources

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data packages this notebook relies on, in a fixed order.
for resource in ('punkt', 'wordnet', 'omw-1.4', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# One shared lemmatizer instance, reused for every token that gets normalised.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Defining the documents and the custom stop-word list

In [None]:
# Toy corpus: document id -> raw text.
documents = dict(
    d1="Herbivores are typically plant eaters and not meat eaters",
    d2="Carnivores are typically meat eaters and not plant eaters",
    d3="Deers eat grass and leaves",
)

# Words dropped during preprocessing (compared against lower-cased tokens).
custom_stopwords = set(["are", "and", "not"])

Preprocessing the documents

In [None]:
def preprocess(text):
    """Turn raw text into a list of normalised tokens.

    The text is lower-cased and split with ``nltk.word_tokenize``. Tokens
    that are not purely alphabetic, or that appear in ``custom_stopwords``,
    are discarded; each surviving token is passed through the shared
    ``lemmatizer``.

    Args:
        text: The raw document string.

    Returns:
        A list of lemmatized tokens in their original order (duplicates kept).
    """
    lemmas = []
    for token in nltk.word_tokenize(text.lower()):
        # Skip punctuation/numeric tokens and stop words before lemmatizing.
        if not token.isalpha() or token in custom_stopwords:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

Printing the preprocessed documents

In [None]:
# Run every document through preprocess(), storing the token list under the
# document id and echoing it as soon as it is produced.
print("Preprocessed Documents:")
preprocessed_docs = {}
for doc_id in documents:
    tokens = preprocess(documents[doc_id])
    preprocessed_docs[doc_id] = tokens
    print(f"{doc_id}: {tokens}")


Preprocessed Documents:
d1: ['herbivore', 'typically', 'plant', 'eater', 'meat', 'eater']
d2: ['carnivore', 'typically', 'meat', 'eater', 'plant', 'eater']
d3: ['deer', 'eat', 'grass', 'leaf']


In [None]:
# Inverted index: term -> set of ids of the documents that contain it.
inverted_index = {}

for doc_id in preprocessed_docs:
    for term in preprocessed_docs[doc_id]:
        # setdefault creates the posting set the first time a term is seen.
        inverted_index.setdefault(term, set()).add(doc_id)

In [None]:
# Show each term with its posting list; the ids are sorted because sets
# have no guaranteed iteration order.
for term in inverted_index:
    postings = sorted(inverted_index[term])
    print(f"{term}: {postings}")

herbivore: ['d1']
typically: ['d1', 'd2']
plant: ['d1', 'd2']
eater: ['d1', 'd2']
meat: ['d1', 'd2']
carnivore: ['d2']
deer: ['d3']
eat: ['d3']
grass: ['d3']
leaf: ['d3']
