Q1

In [1]:
import os
import nltk
import random
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stopwords_set = set(stopwords.words('english'))

def refine_text(input_text, display):
    if display:
        print("Initial text:", input_text)
    text_lower = input_text.lower()
    if display:
        print("After lowercase:", text_lower)
    tokenized = word_tokenize(text_lower)
    if display:
        print("After tokenization:", ' '.join(tokenized))
    no_stopwords = [word for word in tokenized if word not in stopwords_set]
    if display:
        print("After removing stopwords:", ' '.join(no_stopwords))
    final_tokens = [word for word in no_stopwords if word.isalpha()]
    if display:
        print("After removing punctuations and blank spaces:", ' '.join(final_tokens))
    cleaned_text = ' '.join(final_tokens)
    return cleaned_text

src_dir = '/Users/abhijaysingh/Documents/College/Semester 6/Information Retrieval/Final/text_files'
dest_dir = '/Users/abhijaysingh/Documents/College/Semester 6/Information Retrieval/Final/newtextfile'
selected_files = random.sample(range(1, 1001), 5)

os.makedirs(dest_dir, exist_ok=True)

for index in range(1, 1001):
    src_file_path = os.path.join(src_dir, f'file{index}.txt')
    dest_file_path = os.path.join(dest_dir, f'newfile{index}.txt')
    
    if os.path.exists(src_file_path):
        with open(src_file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            is_debug_file = index in selected_files
            processed_text = refine_text(content, display=is_debug_file)
            
            if is_debug_file:
                print(f"\nDetailed processing for file{index}.txt")
            
        with open(dest_file_path, 'w', encoding='utf-8') as new_file:
            new_file.write(processed_text)


Initial text: I bought this item because it had good reviews and it was cost effective compared to the top name brands. So far I have only used this item in a practice setting, but it does exactly as it states. I have noticed if the product is left on for several hours it gets a little warm, but it doesn't seem to be any sort of issue

12V and 18V out allows for easy hookup into those oddball pedals (just make sure you don't plug your 9V pedal into one of those slots).

I would recommend this item to anybody compiling their first pedal board. I know you can just pop a 9V battery in most pedals and away you go, but...

Running the cost comparison of this power supply to 9V batteries makes it pretty obvious that this power supply is worth every penny. Plus you never have to worry about a dead battery in your loop ruining a gig.
After lowercase: i bought this item because it had good reviews and it was cost effective compared to the top name brands. so far i have only used this item in a 

Q2

In [4]:
import os
import pickle

def build_index(directory):
    inverted_idx = {}
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            for word in file.read().split():
                if word not in inverted_idx:
                    inverted_idx[word] = [filename]
                elif filename not in inverted_idx[word]:
                    inverted_idx[word].append(filename)
    return inverted_idx

texts_directory = '/Users/abhijaysingh/Documents/College/Semester 6/Information Retrieval/Final/newtextfile'
index = build_index(texts_directory)
with open('inverted_index_file.pkl', 'wb') as output_file:
    pickle.dump(index, output_file)

def process_queries():
    with open('inverted_index_file.pkl', 'rb') as input_file:
        index = pickle.load(input_file)
        
    num_queries = int(input("Enter number of queries: "))
    for _ in range(num_queries):
        query = input("Enter query: ")
        ops = input("Enter operations: ").split(', ')
        query = refine_text(query, display=False)
        terms = query.split()  
        all_files = set([file for sublist in index.values() for file in sublist])
        result = set(index.get(terms[0], []))
        
        for i, op in enumerate(ops):
            next_term_docs = set(index.get(terms[i + 1], []))
            if op.lower() == 'and':
                result &= next_term_docs
            elif op.lower() == 'or':
                result |= next_term_docs
            elif op.lower() == 'and not':
                result -= next_term_docs
            elif op.lower() == 'or not':
                result = (result | (all_files - next_term_docs))
        
        results = (list(result))
        
        print(f"Query: {' '.join([terms[0]] + [op + ' ' + term for op, term in zip(ops, terms[1:])])}")
        print(f"Documents found: {len(results)}")
        print(f"Document names: {', '.join(results)}")

if __name__ == "__main__":
    process_queries()


Query: guitar and sounds
Documents found: 27
Document names: newfile183.txt, newfile166.txt, newfile801.txt, newfile840.txt, newfile514.txt, newfile277.txt, newfile804.txt, newfile98.txt, newfile982.txt, newfile361.txt, newfile405.txt, newfile978.txt, newfile729.txt, newfile396.txt, newfile324.txt, newfile569.txt, newfile271.txt, newfile407.txt, newfile571.txt, newfile903.txt, newfile106.txt, newfile593.txt, newfile470.txt, newfile541.txt, newfile413.txt, newfile484.txt, newfile936.txt


Q3

In [10]:
import os
import pickle
from nltk.tokenize import word_tokenize
from collections import defaultdict

stop_words = set()

def default_list():
    return defaultdict(list)

def build_positional_index(path):
    p_index = defaultdict(default_list)
    for file in os.listdir(path):
      
            with open(os.path.join(path, file), 'r', encoding='utf-8') as content:
                tokens = [w for w in word_tokenize(content.read().lower()) if w.isalpha() and w not in stop_words]
                for pos, token in enumerate(tokens):
                    p_index[token][file].append(pos)
    return p_index

def save_index(index, filename='positional.pkl'):
    with open(filename, 'wb') as f:
        pickle.dump(index, f)

def load_index(filename='positional.pkl'):
    with open(filename, 'rb') as f:
        return pickle.load(f)

def search_phrases(index, phrase):
    words = word_tokenize(phrase.lower())
    if not words:
        return []
    docs = set(index[words[0]].keys())
    for i in range(1, len(words)):
        docs &= {d for d in index[words[i]].keys() if any(p - i in index[words[0]][d] for p in index[words[i]][d])}
    return list(docs)

dir_path = '/Users/abhijaysingh/Documents/College/Semester 6/Information Retrieval/Final/newtextfile'
pos_index = build_positional_index(dir_path)
save_index(pos_index)

loaded_idx = load_index()
query_count = int(input("Number of phrase queries: "))

for _ in range(query_count):
    query_input = input("Enter phrase query: ")
    query_input = refine_text(query_input, display=False)
    results = search_phrases(loaded_idx, query_input)
    print(f"Phrase: '{query_input}' found in {len(results)} documents: {', '.join(results)}")


Phrase: 'call' found in 1 documents: newfile96.txt
