Import the necessary packages.

In [151]:
import os
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pickle
import json
import re

# nltk.download('punkt')
# nltk.download('stopwords')

Demonstrate each preprocessing step on five sample files: file1.txt, file10.txt, file100.txt, file101.txt and file102.txt.

In [152]:
source_directory = 'text_files'
sample_size = 5  # number of documents to walk through step by step

stop_words = set(stopwords.words('english'))
punctuation_table = str.maketrans('', '', string.punctuation)

# os.listdir returns entries in arbitrary order; sorting makes the demo show
# the same files on every run.
sample_files = sorted(
    name for name in os.listdir(source_directory) if name.endswith('.txt')
)[:sample_size]

for filename in sample_files:
    print("-----------------------------------------------------------------------")
    print(filename)
    file_path = os.path.join(source_directory, filename)
    # Read the whole document, then release the handle before processing.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    print("Original Text:", text)

    text = text.lower()  # Lowercase the text
    print("1. Lower Case: ", text)

    tokens = word_tokenize(text)  # Tokenize
    print("2. tokenized: ", ' '.join(tokens))

    # Strip punctuation characters; pure-punctuation tokens become ''.
    tokens = [w.translate(punctuation_table) for w in tokens]
    print("3. Punctuation Removed:", ' '.join(tokens))

    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    print("4. Stopwords Removed:", ' '.join(tokens))

    # Drop the empty / whitespace-only tokens left behind by punctuation removal.
    tokens = [word for word in tokens if word.strip()]
    print("5. Space Removed:", ' '.join(tokens))

count = len(sample_files)  # number of documents actually shown

-----------------------------------------------------------------------
file1.txt
Original Text: Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
1. Lower Case:  loving these vintage springs on my vintage strat. they have a good tension and great stability. if you are floating your bridge and want the most out of your springs than these are the way to go.
2. tokenized:  loving these vintage springs on my vintage strat . they have a good tension and great stability . if you are floating your bridge and want the most out of your springs than these are the way to go .
3. Punctuation Removed: loving these vintage springs on my vintage strat  they have a good tension and great stability  if you are floating your bridge and want the most out of your springs than these are the way to go 
4. Stopwords Removed: loving vintage springs vintage strat  g

In [153]:
def preprocessing(text):
    """Normalise raw text into a space-separated string of index terms.

    The text is lowercased and tokenised with ``word_tokenize``; each token
    then has punctuation characters stripped via ``punctuation_table``, and is
    discarded if it is in ``stop_words`` or is empty / whitespace-only.

    Relies on the module-level ``punctuation_table`` and ``stop_words``.

    Args:
        text: Raw document or query string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept_tokens = []
    for raw_token in word_tokenize(text.lower()):
        cleaned = raw_token.translate(punctuation_table)
        if cleaned in stop_words:
            continue
        if not cleaned.strip():
            # Pure-punctuation tokens collapse to '' after translation.
            continue
        kept_tokens.append(cleaned)
    return ' '.join(kept_tokens)


In [154]:
destination_directory = 'preprocessed'
# exist_ok avoids the separate existence check.
os.makedirs(destination_directory, exist_ok=True)

stop_words = set(stopwords.words('english'))
punctuation_table = str.maketrans('', '', string.punctuation)

# Universe of document names, used as the complement base for OR NOT queries.
all_documents = set()
for filename in os.listdir(source_directory):
    if not filename.endswith('.txt'):
        # Only real documents belong in the universe; stray directory entries
        # would otherwise show up in OR NOT results.
        continue
    all_documents.add(filename)

    file_path = os.path.join(source_directory, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    text = preprocessing(text)

    # Write the cleaned text under the same file name in the output directory.
    processed_file_path = os.path.join(destination_directory, filename)
    with open(processed_file_path, 'w', encoding='utf-8') as processed_file:
        processed_file.write(text)

In [155]:
directory = 'preprocessed'
# Maps each term to the list of file names that contain it.
inverted_index = {}

# os.listdir order is arbitrary; sorting keeps posting lists reproducible.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(directory, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        words = file.read().split()
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so each
    # file is appended at most once per term without scanning the posting list.
    for word in dict.fromkeys(words):
        inverted_index.setdefault(word, []).append(filename)


In [156]:
# Serialize the inverted index to 'inverted_index.pkl' with pickle.
with open('inverted_index.pkl', 'wb') as f:
    pickle.dump(inverted_index, f)

In [157]:
# Reload the index from disk and make it the working `inverted_index`.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('inverted_index.pkl', 'rb') as f:
    inverted_index = loaded_inverted_index = pickle.load(f)
    

In [None]:
# Render the reloaded index as indented JSON and print it for inspection.
pretty_dict = json.dumps(loaded_inverted_index, indent=4)
print(pretty_dict)

In [139]:
# Quick check of preprocessing() on a short sample string.
text = 'hello america'
print(preprocessing(text))

hello america


In [159]:
def and_operation(doc_set1, doc_set2):
    """Boolean AND: return the documents present in both posting sets."""
    shared_docs = doc_set1.intersection(doc_set2)
    return shared_docs

def or_operation(doc_set1, doc_set2):
    """Boolean OR: return the documents present in either posting set."""
    combined_docs = doc_set1.union(doc_set2)
    return combined_docs

def and_not_operation(doc_set1, doc_set2):
    """Boolean AND NOT: return documents in ``doc_set1`` that are not in ``doc_set2``."""
    remaining_docs = doc_set1.difference(doc_set2)
    return remaining_docs

def or_not_operation(doc_set1, doc_set2, all_docs=None):
    """Boolean OR NOT: documents in ``doc_set1`` or outside ``doc_set2``.

    Args:
        doc_set1: Posting set of the left operand.
        doc_set2: Posting set of the right operand (the negated term).
        all_docs: Universe of documents used to compute the complement of
            ``doc_set2``. When omitted, the module-level ``all_documents`` is
            looked up at call time, so rebuilding that set in another cell is
            picked up without redefining this function.

    Returns:
        The union of ``doc_set1`` with ``all_docs - doc_set2``.
    """
    if all_docs is None:
        all_docs = all_documents
    not_doc_set2 = all_docs.difference(doc_set2)
    return doc_set1.union(not_doc_set2)

def get_docs_for_term(term, inverted_index):
    """Return the set of documents listed for ``term`` in ``inverted_index``.

    A term that is absent from the index yields an empty set.
    """
    if term in inverted_index:
        return set(inverted_index[term])
    return set()

# Dispatch table: boolean operator keyword -> function implementing it.
operation_functions = dict([
    ('AND', and_operation),
    ('OR', or_operation),
    ('AND NOT', and_not_operation),
    ('OR NOT', or_not_operation),
])


In [None]:
def _query_result_order(doc_name):
    """Sort key for result names: by first number in the name, digit-less names last."""
    match = re.search(r'\d+', doc_name)
    if match is None:
        return (1, 0, doc_name)
    return (0, int(match.group()), doc_name)


n = int(input("Enter the number of queries: "))
for k in range(n):
    query = input(f"Query {k+1}: ")
    # Queries go through the same pipeline as the documents so terms match the index.
    processed_query_list = preprocessing(query).split()

    if len(processed_query_list) == 0:
        print("Empty string")
        continue
    if len(processed_query_list) == 1:
        only_term = processed_query_list[0]
        matching_docs = get_docs_for_term(only_term, inverted_index)
        print(f"Only One words is searched- {only_term}: found in {len(matching_docs)} docs", matching_docs)
        continue

    ops = input("Enter operators (AND, OR, AND NOT, OR NOT), separated by commas: ")
    # Normalise case and inner whitespace so 'and  not' is accepted as 'AND NOT'.
    ops_list = [' '.join(x.upper().split()) for x in ops.split(',')]

    if len(ops_list) != len(processed_query_list) - 1:
        print("Mismatch between number of terms and operators")
        continue

    # Reject the query up front rather than silently printing a partial result.
    unknown_ops = [op for op in ops_list if op not in operation_functions]
    if unknown_ops:
        print(f"Unknown operation: {unknown_ops[0]}")
        continue

    remaining_terms = processed_query_list[1:]
    query_summary = f'Query {k+1}: {processed_query_list[0]} '
    for op, term in zip(ops_list, remaining_terms):
        query_summary += f'{op} {term} '
    print()

    # Evaluate strictly left to right: ((t0 op0 t1) op1 t2) ...
    current_docs = get_docs_for_term(processed_query_list[0], inverted_index)
    for op, term in zip(ops_list, remaining_terms):
        next_docs = get_docs_for_term(term, inverted_index)
        current_docs = operation_functions[op](current_docs, next_docs)

    current_docs = sorted(current_docs, key=_query_result_order)
    print(query_summary)
    print(f'Number of documents retrieved for query {k+1} : {len(current_docs)}')
    print(f'Names of documents retrieved for query {k+1} : {current_docs}\n')

In [None]:
def _doc_number(name):
    """Sort key: (first number in the file name, name); names without digits sort last."""
    match = re.search(r'\d+', name)
    number = int(match.group()) if match else float('inf')
    return (number, name)


directory = 'preprocessed'
# Filter to .txt files before sorting so stray entries never reach the key function.
filenames = sorted(
    (name for name in os.listdir(directory) if name.endswith('.txt')),
    key=_doc_number,
)

# Maps term -> {file name -> list of 0-based token positions in that file}.
positional_index = {}
for filename in filenames:
    file_path = os.path.join(directory, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        words = file.read().split()
    for position, word in enumerate(words):
        positional_index.setdefault(word, {}).setdefault(filename, []).append(position)

with open('positional_index.pkl', 'wb') as f:
    pickle.dump(positional_index, f)
positional_index

In [None]:
# Reload the positional index from 'positional_index.pkl'.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('positional_index.pkl', 'rb') as f:
    loaded_positional_index = pickle.load(f)