<a href="https://colab.research.google.com/github/arya-snh/CSE508_Winter2024_A1_2020498/blob/main/IR_Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import pickle
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

## Q1. Data Preprocessing

In [None]:
# Fetch the NLTK resources that preprocess_text relies on.
# 'punkt' is the tokenizer model used by older NLTK releases; newer releases
# (3.8.2 / 3.9 onwards) make word_tokenize load 'punkt_tab' instead and raise
# LookupError if only 'punkt' is present, so both are requested here.
nltk.download('punkt')
nltk.download('punkt_tab')
# English stopword list used for the stopword-removal step.
nltk.download('stopwords')

def preprocess_text(input_text):
    """Normalise raw text into a single space-separated string of tokens.

    The steps are applied in this order:
      a. lowercase the text
      b. tokenize with NLTK's ``word_tokenize``
      c. drop English stopwords
      d. drop punctuation tokens
      e. drop tokens that are empty once stripped of whitespace

    Args:
        input_text: The raw text of one document or query.

    Returns:
        The surviving tokens joined by single spaces.
    """
    words = word_tokenize(input_text.lower())
    english_stopwords = set(stopwords.words('english'))

    def is_content_token(word):
        # c. stopword removal
        if word in english_stopwords:
            return False
        # d. `punctuation` is a string, so this is a substring test: a token
        #    is dropped when it occurs as a contiguous run inside that string.
        if word in punctuation:
            return False
        # e. discard tokens that consist only of whitespace
        return bool(word.strip())

    return ' '.join(filter(is_content_token, words))

def preprocess_folder(input_folder, output_folder):
    """Preprocess every ``.txt`` file in a folder and write the results out.

    Each ``.txt`` file found directly inside ``input_folder`` is read as UTF-8,
    passed through ``preprocess_text`` and written under the same file name
    into ``output_folder``. Entries without a ``.txt`` suffix are ignored.

    Args:
        input_folder: Directory containing the raw text files.
        output_folder: Directory that receives the preprocessed files. It is
            created (including parents) when it does not exist yet.
    """
    # Without this, the first open(..., 'w') below fails with FileNotFoundError
    # on a fresh run where the output directory has not been created by hand.
    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(input_folder, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            original_text = file.read()

        preprocessed_text = preprocess_text(original_text)

        output_file_path = os.path.join(output_folder, filename)
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(preprocessed_text)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Folder holding the raw input text files.
# NOTE(review): the /content/drive/MyDrive prefix looks like a Colab Google
# Drive mount — confirm the drive is mounted before running these cells.
data_path = '/content/drive/MyDrive/IR/A1/text_files'
# Folder that receives / holds the preprocessed version of each file; the
# index-building cells further down read from this folder.
preprocessed_data_path = '/content/drive/MyDrive/IR/A1/preprocessed_text_files'
# Disabled call: uncomment to (re)generate the preprocessed files from data_path.
# preprocess_folder(data_path, preprocessed_data_path)

In [None]:
## Print 5 samples
# Shows the text of five files before and after preprocessing.
# os.listdir returns entries in arbitrary order, so the [2:7] slice picks
# whichever five entries happen to sit at those positions on this run.
for filename in os.listdir(data_path)[2:7]:
    file_path = os.path.join(data_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        original_text = file.read()
    print(f"\nOriginal content of {filename}:\n{original_text}\n")

    # Preprocess in memory only; nothing is written back to disk here.
    preprocessed_text = preprocess_text(original_text)
    print(f"\nPreprocessed content of {filename}:\n{preprocessed_text}\n")



Original content of file903.txt:
This guitar is perfect in every way! It's so easy to play and sounds fantastic! Perfectly balanced with zero neck dive. The bomb inlays just look cool! I definitely plan on buying more Hardluck Kings guitars!


Performing operation a. Lowercase the text
this guitar is perfect in every way! it's so easy to play and sounds fantastic! perfectly balanced with zero neck dive. the bomb inlays just look cool! i definitely plan on buying more hardluck kings guitars!

Performing operation b. Perform tokenization
['this', 'guitar', 'is', 'perfect', 'in', 'every', 'way', '!', 'it', "'s", 'so', 'easy', 'to', 'play', 'and', 'sounds', 'fantastic', '!', 'perfectly', 'balanced', 'with', 'zero', 'neck', 'dive', '.', 'the', 'bomb', 'inlays', 'just', 'look', 'cool', '!', 'i', 'definitely', 'plan', 'on', 'buying', 'more', 'hardluck', 'kings', 'guitars', '!']

Performing operation c. Remove stopwords
['guitar', 'perfect', 'every', 'way', '!', "'s", 'easy', 'play', 'sounds'

## Q2. Unigram Inverted Index

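**Open question:** should the inverted index also store the document frequency of each term? (Each posting list below holds only the names of the documents that contain the term.)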

In [None]:
def create_inverted_index(input_folder):
    """Build a unigram inverted index over the files in a folder.

    Every entry of ``input_folder`` is read as UTF-8 text and split on
    whitespace. Each distinct term of a document contributes that document's
    file name once to the term's posting list.

    Args:
        input_folder: Directory containing the (already preprocessed) documents.

    Returns:
        dict mapping each term to a list of file names that contain it. The
        order of names within a list follows the order in which
        ``os.listdir`` yielded the files.
    """
    inverted_index = {}

    for filename in os.listdir(input_folder):
        file_path = os.path.join(input_folder, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            document_text = file.read()

        # set() ensures a document is recorded once per term even if the
        # term occurs several times in it.
        for term in set(document_text.split()):
            inverted_index.setdefault(term, []).append(filename)

    # The index must be returned; without this the caller receives None.
    return inverted_index

In [None]:
def print_inverted_index(inverted_index):
    """Print one line per term in the form ``term: doc1, doc2, ...``.

    Args:
        inverted_index: Mapping from term to an iterable of document names
            (strings), as produced by the index-building step.
    """
    for term in inverted_index:
        posting_line = ', '.join(inverted_index[term])
        print(f"{term}: {posting_line}")

In [None]:
# Build the unigram inverted index from the preprocessed documents.
inverted_index = create_inverted_index(preprocessed_data_path)
# Location of the pickled index; the query cell further down loads the same file.
output_file_path = "/content/drive/MyDrive/IR/A1/inverted_index.pkl"

# save using pickle
with open(output_file_path, 'wb') as file:
        pickle.dump(inverted_index, file)

# load from pickle
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(output_file_path, 'rb') as file:
        inverted_index = pickle.load(file)

# Prints every term with its posting list, so the output is very long for a
# large vocabulary (the recorded output was truncated to its last 5000 lines).
print_inverted_index(inverted_index)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
behringer: file444.txt, file29.txt, file855.txt, file755.txt, file597.txt, file306.txt, file629.txt, file637.txt, file794.txt, file545.txt, file510.txt, file807.txt, file938.txt
lr: file444.txt, file804.txt
character: file444.txt, file688.txt, file637.txt, file541.txt, file220.txt, file106.txt, file916.txt, file253.txt
knock: file444.txt, file309.txt, file622.txt, file993.txt, file654.txt, file396.txt, file13.txt, file674.txt
manual: file444.txt, file870.txt, file270.txt, file703.txt, file523.txt, file575.txt
bandwidth: file444.txt
upper: file444.txt
user: file444.txt, file491.txt, file722.txt, file969.txt, file270.txt, file196.txt, file711.txt
sweetening: file444.txt
performs: file444.txt, file307.txt
applications: file444.txt, file130.txt, file998.txt
handling: file444.txt
low: file444.txt, file858.txt, file388.txt, file507.txt, file868.txt, file111.txt, file883.txt, file629.txt, file274.txt, file63.txt, file270.txt, fi

In [None]:
def execute_query(query, inverted_index, all_documents=None):
    """Evaluate a Boolean query against a unigram inverted index.

    The query is a whitespace-separated sequence
    ``term (OP term)*`` where ``OP`` is one of ``AND``, ``OR``, ``AND NOT``
    or ``OR NOT``. Operators are applied strictly left to right; there is no
    operator precedence and no parenthesis support.

    Args:
        query: The query string, e.g. ``"car OR bag AND NOT canister"``.
        inverted_index: Mapping from term to an iterable of document names.
        all_documents: Optional iterable with the name of every document in
            the corpus. It is only needed for ``OR NOT``. When omitted, the
            corpus is taken from the files listed in the module-level
            ``preprocessed_data_path`` folder, and that listing is performed
            only if an ``OR NOT`` is actually evaluated.

    Returns:
        list of matching document names (order is unspecified). An empty or
        whitespace-only query returns an empty list.

    Raises:
        ValueError: If a token in operator position is not ``AND``/``OR`` or
            an operator is not followed by a term.
    """
    query_parts = query.split()
    if not query_parts:
        return []

    def postings(term):
        # Unknown terms have an empty posting set instead of None.
        return set(inverted_index.get(term, ()))

    result_documents = postings(query_parts[0])

    i = 1
    while i < len(query_parts):
        operator = query_parts[i]
        if operator not in ("AND", "OR"):
            # Previously this case never advanced `i` and looped forever.
            raise ValueError(
                f"Expected AND or OR at position {i} of query {query!r}, got {operator!r}"
            )

        negated = i + 1 < len(query_parts) and query_parts[i + 1] == "NOT"
        operand_index = i + 2 if negated else i + 1
        if operand_index >= len(query_parts):
            raise ValueError(f"Operator at position {i} of query {query!r} has no operand")

        operand_documents = postings(query_parts[operand_index])

        if operator == "AND":
            if negated:
                result_documents = result_documents.difference(operand_documents)
            else:
                result_documents = result_documents.intersection(operand_documents)
        elif negated:
            # OR NOT needs the complement, hence the full document universe.
            if all_documents is None:
                all_documents = os.listdir(preprocessed_data_path)
            result_documents = result_documents.union(
                set(all_documents).difference(operand_documents)
            )
        else:
            result_documents = result_documents.union(operand_documents)

        i = operand_index + 1

    return list(result_documents)

In [None]:
inverted_index_path = "/content/drive/MyDrive/IR/A1/inverted_index.pkl"

# NOTE: pickle.load can execute arbitrary code; only load index files that
# this notebook wrote itself.
with open(inverted_index_path, 'rb') as file:
    inverted_index = pickle.load(file)

# Input format: the first line is the number of queries N. Each query then
# takes two lines:
#   1. free text whose terms (after preprocessing) are the query operands
#   2. comma-separated operators, one fewer than the number of operands,
#      each being AND, OR, AND NOT or OR NOT
ALLOWED_OPERATIONS = {"AND", "OR", "AND NOT", "OR NOT"}

N = int(input())

for i in range(N):
    input_sequence = input().strip()
    # Tolerate any spacing around commas and inside "AND NOT" / "OR NOT",
    # and accept lowercase operators.
    operations = [" ".join(op.upper().split()) for op in input().split(',') if op.strip()]

    # Preprocess the input sequence the same way the documents were preprocessed
    query_terms = preprocess_text(input_sequence).split()

    if not query_terms:
        print("\nQuery skipped: no terms left after preprocessing", repr(input_sequence))
        print()
        continue

    unknown_operations = [op for op in operations if op not in ALLOWED_OPERATIONS]
    if unknown_operations:
        print("\nQuery skipped: unsupported operations", unknown_operations)
        print()
        continue

    # zip() would silently truncate on a mismatch and produce a wrong query,
    # so the counts are checked explicitly.
    if len(operations) != len(query_terms) - 1:
        print(
            "\nQuery skipped: expected", len(query_terms) - 1,
            "operations for terms", query_terms, "but got", len(operations),
        )
        print()
        continue

    # Interleave: term0 op0 term1 op1 ... termK
    query_tokens = [query_terms[0]]
    for operation, term in zip(operations, query_terms[1:]):
        query_tokens.extend((operation, term))
    query = " ".join(query_tokens)

    # Execute the query
    result_documents = execute_query(query, inverted_index)

    # Print the output
    print("\nQuery:", query)
    print("Number of documents retrieved:" , len(result_documents))
    print("Names of the documents retrieved", result_documents)
    print()

2
Car bag in a canister
OR, AND NOT

Query: car OR bag AND NOT canister
Number of documents retrieved: 31
Names of the documents retrieved ['file780.txt', 'file404.txt', 'file942.txt', 'file864.txt', 'file264.txt', 'file459.txt', 'file892.txt', 'file313.txt', 'file542.txt', 'file860.txt', 'file863.txt', 'file686.txt', 'file174.txt', 'file665.txt', 'file363.txt', 'file682.txt', 'file118.txt', 'file466.txt', 'file573.txt', 'file698.txt', 'file73.txt', 'file981.txt', 'file166.txt', 'file699.txt', 'file956.txt', 'file930.txt', 'file886.txt', 'file797.txt', 'file746.txt', 'file738.txt', 'file3.txt']

Coffee brewing techniques in cookbook
AND, OR NOT, OR

Query: coffee AND brewing OR NOT techniques OR cookbook
Number of documents retrieved: 999
Names of the documents retrieved ['file362.txt', 'file71.txt', 'file376.txt', 'file700.txt', 'file226.txt', 'file827.txt', 'file826.txt', 'file965.txt', 'file583.txt', 'file161.txt', 'file13.txt', 'file205.txt', 'file517.txt', 'file360.txt', 'file990.

## Q3. Positional Index and Phrase Queries


In [None]:
def create_positional_index(input_folder):
    """Build a positional index over the files in a folder.

    Every entry of ``input_folder`` is read as UTF-8 text and split on
    whitespace. Positions are 1-based offsets of a term within its document.

    Args:
        input_folder: Directory containing the (already preprocessed) documents.

    Returns:
        dict of the form ``{term: {file_name: [position, ...]}}`` with each
        position list in increasing order.
    """
    positional_index = {}

    for doc_name in os.listdir(input_folder):
        with open(os.path.join(input_folder, doc_name), 'r', encoding='utf-8') as handle:
            tokens = handle.read().split()

        # enumerate(..., start=1) yields the 1-based position of each token.
        for offset, token in enumerate(tokens, start=1):
            per_document = positional_index.setdefault(token, {})
            per_document.setdefault(doc_name, []).append(offset)

    return positional_index

In [None]:
# Build the positional index from the preprocessed documents folder.
positional_index = create_positional_index(preprocessed_data_path)

In [None]:
# Pickle location for the positional index. This rebinds output_file_path,
# which earlier held the inverted-index pickle path.
output_file_path = "/content/drive/MyDrive/IR/A1/positional_index.pkl"

In [None]:
# Save the positional index to disk using pickle.
with open(output_file_path, 'wb') as file:
        pickle.dump(positional_index, file)

In [None]:
# Load the positional index back from the pickle file.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(output_file_path, 'rb') as file:
        positional_index = pickle.load(file)

In [None]:
def phrase_query(positional_index, phrase):
    """Find the documents that contain the terms of ``phrase`` consecutively.

    Args:
        positional_index: Mapping ``{term: {document: [position, ...]}}``.
        phrase: Whitespace-separated terms, already preprocessed the same way
            as the indexed documents.

    Returns:
        list of document names in which every term of the phrase occurs at
        consecutive positions, in order (order of the list is unspecified).
        An empty phrase returns an empty list; a single-term phrase returns
        every document containing that term.
    """
    phrase_terms = phrase.split()
    if not phrase_terms:
        return []

    # Per-term {document: positions} maps; unknown terms map to nothing.
    term_postings = [positional_index.get(term, {}) for term in phrase_terms]

    # Only documents that contain every term can contain the phrase.
    potential_documents = set(term_postings[0])
    for postings in term_postings[1:]:
        potential_documents.intersection_update(postings)

    final_documents = []
    for document in potential_documents:
        # Positions at which the phrase prefix matched so far ends. Carrying
        # only these forward keeps the whole chain contiguous, rather than
        # merely checking each adjacent pair of terms independently.
        match_ends = set(term_postings[0][document])
        for postings in term_postings[1:]:
            match_ends = {pos for pos in postings[document] if pos - 1 in match_ends}
            if not match_ends:
                break
        if match_ends:
            final_documents.append(document)

    return final_documents

In [None]:
# Input format: the first line is the number of phrase queries N, followed by
# one raw phrase per line.
N = int(input())
print()
for i in range(N):
    # Preprocess the phrase (lowercasing, stopword and punctuation removal),
    # so the positions compared are those of the surviving terms only.
    # NOTE(review): assumes the indexed documents went through the same preprocessing — confirm.
    phrase = preprocess_text(input())

    matching_documents = phrase_query(positional_index, phrase)

    # Names are sorted for display; the count is reported alongside.
    print("\nNumber of documents retrieved for the phrase '{}' using positional index:".format(phrase), len(matching_documents))
    print("Names of documents retrieved for the phrase '{}' using positional index:".format(phrase), sorted(matching_documents))
    print()

1

happiness is a pain

Number of documents retrieved for the phrase 'happiness pain' using positional index: 0
Names of documents retrieved for the phrase 'happiness pain' using positional index: []

