In [None]:
# Mount Google Drive at /content/drive; the corpus paths used by later cells
# all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [12]:
# Import necessary libraries
from google.colab import drive
import nltk
import os
import string
import random
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

# Expose the Drive contents under /content/drive
drive.mount('/content/drive')

# Fetch the NLTK packages this notebook relies on (tokenizer data and stopword lists)
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Define the text preprocessing function
def preprocess_text(text, print_content=False):
    """Normalise raw text into a space-separated string of content words.

    Stages, in order: strip markup with BeautifulSoup, lowercase, tokenize
    with ``word_tokenize``, drop English stopwords, then drop every token
    that is not purely alphabetic.

    Args:
        text: Raw document text (may contain HTML markup).
        print_content: When True, print the text/tokens after each stage.

    Returns:
        The surviving tokens joined by single spaces.
    """
    def show(heading, value):
        # Stage-by-stage trace, emitted only when the caller asked for it.
        if print_content:
            print(heading, value)

    show("Original Text:\n", text)

    # Strip any HTML markup, then normalise case
    lowered = BeautifulSoup(text, "html.parser").get_text().lower()
    show("\nAfter Lowercasing:\n", lowered)

    # Split into word / punctuation tokens
    words = word_tokenize(lowered)
    show("\nAfter Tokenization:\n", words)

    # Filter out English stopwords
    english_stopwords = set(stopwords.words('english'))
    content_words = []
    for token in words:
        if token in english_stopwords:
            continue
        content_words.append(token)
    show("\nAfter Removing Stopwords:\n", content_words)

    # Keep alphabetic tokens only; this discards punctuation and numbers alike
    alphabetic_words = list(filter(str.isalpha, content_words))
    show("\nAfter Removing Punctuation:\n", alphabetic_words)

    return ' '.join(alphabetic_words)


# Function to preprocess files within a specified directory
def preprocess_files(directory_path):
    """Preprocess every regular file in a directory and save the results.

    Each file directly inside ``directory_path`` is read as UTF-8, passed
    through ``preprocess_text`` and written to a ``preprocessed_files``
    sub-directory (created if missing). Up to five randomly chosen files have
    their intermediate stages printed.

    Output naming: a trailing ``.txt`` extension becomes ``_preprocessed.txt``;
    names that do not end in ``.txt`` are kept unchanged.

    Args:
        directory_path: Directory holding the raw text files.
    """
    # Create a folder to store preprocessed files if it doesn't exist
    preprocessed_dir = os.path.join(directory_path, 'preprocessed_files')
    os.makedirs(preprocessed_dir, exist_ok=True)

    # Regular files only (sub-directories such as preprocessed_files are
    # skipped); sorted so the processing order does not depend on the OS.
    file_paths = []
    for file_name in sorted(os.listdir(directory_path)):
        candidate = os.path.join(directory_path, file_name)
        if os.path.isfile(candidate):
            file_paths.append(candidate)

    # Randomly select up to 5 files whose before/after content is displayed.
    # Stored as a set so the per-file membership test below is O(1).
    sample_files = set(random.sample(file_paths, min(5, len(file_paths))))

    txt_suffix = '.txt'
    for file_path in file_paths:
        # Read file content
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Check if the file is one of the sample files to print content
        print_content = file_path in sample_files

        # Preprocess text
        preprocessed_text = preprocess_text(text, print_content)

        # Only the trailing extension is rewritten; str.replace would also
        # rewrite a '.txt' that happens to appear in the middle of the name.
        file_name = os.path.basename(file_path)
        if file_name.endswith(txt_suffix):
            output_name = file_name[:-len(txt_suffix)] + '_preprocessed.txt'
        else:
            output_name = file_name
        preprocessed_file_path = os.path.join(preprocessed_dir, output_name)
        with open(preprocessed_file_path, 'w', encoding='utf-8') as file:
            file.write(preprocessed_text)

        if print_content:
            print(f"\nPreprocessed text saved: {preprocessed_file_path}\n")
            print('//////////////////////////////////////////////\n')

# Usage example with a Google Drive path
# Running this cell preprocesses every file in directory_path right away; the
# results are written to a 'preprocessed_files' folder inside that directory.
directory_path = '/content/drive/My Drive/IR_Assignment1/text_files'  # Adjust this path as needed
preprocess_files(directory_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  soup = BeautifulSoup(text, "html.parser")


Original Text:
 Love the blue glow of the power supply on my pedal board.  I've only had it for a day so I don't know how it will hold up, but i'm sure it will be fine.  Comes with all of the cables to fully power your pedal board.  The A/C adapter cable is only 3 feet long so consider buying an extension chord or power strip to plug it into.

After Lowercasing:
 love the blue glow of the power supply on my pedal board.  i've only had it for a day so i don't know how it will hold up, but i'm sure it will be fine.  comes with all of the cables to fully power your pedal board.  the a/c adapter cable is only 3 feet long so consider buying an extension chord or power strip to plug it into.

After Tokenization:
 ['love', 'the', 'blue', 'glow', 'of', 'the', 'power', 'supply', 'on', 'my', 'pedal', 'board', '.', 'i', "'ve", 'only', 'had', 'it', 'for', 'a', 'day', 'so', 'i', 'do', "n't", 'know', 'how', 'it', 'will', 'hold', 'up', ',', 'but', 'i', "'m", 'sure', 'it', 'will', 'be', 'fine', '.', '

In [18]:
import os
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict

class TextPreprocessor:
    """Turns raw text into a list of lowercase, alphabetic, non-stopword tokens."""

    def __init__(self):
        # English stopwords are loaded once per instance; a set keeps lookups cheap.
        self.stop_words = set(stopwords.words('english'))

    def preprocess(self, text):
        """Lowercase ``text``, tokenize it and filter the tokens.

        Args:
            text: The string to tokenize.

        Returns:
            The tokens, in their original order, that are purely alphabetic
            and are not in ``self.stop_words``.
        """
        kept = []
        for token in word_tokenize(text.lower()):
            if not token.isalpha():
                continue
            if token in self.stop_words:
                continue
            kept.append(token)
        return kept

class InvertedIndex:
    """Boolean inverted index mapping each term to the set of file names containing it."""

    def __init__(self, directory_path):
        """Remember the corpus directory and start with an empty index.

        Args:
            directory_path: Directory whose files ``build_index`` will read.
        """
        self.directory_path = directory_path
        # term -> {file name, ...}
        self.index = defaultdict(set)

    def build_index(self):
        """Tokenize every regular file in ``directory_path`` and record its terms.

        Entries that are not regular files (for example sub-directories) are
        skipped; opening them would raise instead of indexing anything.
        Files are read as UTF-8 and tokenized with ``TextPreprocessor``.
        """
        preprocessor = TextPreprocessor()
        for file_name in sorted(os.listdir(self.directory_path)):
            file_path = os.path.join(self.directory_path, file_name)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
            # Each distinct term only needs to be recorded once per file.
            for token in set(preprocessor.preprocess(text)):
                self.index[token].add(file_name)

    def save_to_file(self, file_name):
        """Pickle the term -> file-names mapping (not the whole object) to ``file_name``."""
        with open(file_name, 'wb') as file:
            pickle.dump(self.index, file)

    @staticmethod
    def load_from_file(file_name):
        """Unpickle and return whatever object ``file_name`` contains.

        For a file written by ``save_to_file`` this is the raw term mapping,
        not an ``InvertedIndex`` instance.
        """
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file; only load index files this notebook produced itself.
        with open(file_name, 'rb') as file:
            return pickle.load(file)

class QueryProcessor:
    """Evaluates left-to-right boolean queries against a term -> documents mapping."""

    def __init__(self, inverted_index):
        """Store the index to query.

        Args:
            inverted_index: Mapping with a ``get`` method that maps a term to
                a set of document names.
        """
        self.inverted_index = inverted_index

    def process_query(self, query, operations):
        """Combine the posting sets of the query terms with the given operators.

        The query is tokenized with ``TextPreprocessor``; operator ``i`` joins
        the running result with term ``i + 1``. Surplus operators or surplus
        terms are ignored.

        Args:
            query: Free-text query string.
            operations: Sequence of operator names (see ``perform_operation``).

        Returns:
            A new set of matching document names; empty when the query has no
            usable terms.
        """
        preprocessor = TextPreprocessor()
        query_terms = preprocessor.preprocess(query)
        if not query_terms:
            return set()

        # Copy the first posting set so the caller never receives (and cannot
        # accidentally mutate) a set that lives inside the index.
        result_set = set(self.inverted_index.get(query_terms[0], ()))
        # zip stops at the shorter sequence, pairing operator i with term i + 1.
        for operation, term in zip(operations, query_terms[1:]):
            next_term_set = self.inverted_index.get(term, set())
            result_set = self.perform_operation(result_set, next_term_set, operation)
        return result_set

    @staticmethod
    def perform_operation(doc_set1, doc_set2, operation):
        """Apply one boolean operator to two document sets.

        Operator names are matched case-insensitively and with surrounding or
        repeated whitespace ignored, so ``' and  not '`` means ``'AND NOT'``.

        Args:
            doc_set1: Left operand (running result).
            doc_set2: Right operand (posting set of the next term).
            operation: ``'AND'``, ``'OR'`` or ``'AND NOT'``.

        Returns:
            The combined set; ``doc_set1`` unchanged for an unrecognised operator.
        """
        if isinstance(operation, str):
            operation = ' '.join(operation.split()).upper()
        if operation == 'AND':
            return doc_set1 & doc_set2
        elif operation == 'OR':
            return doc_set1 | doc_set2
        elif operation == 'AND NOT':
            return doc_set1 - doc_set2
        return doc_set1

if __name__ == "__main__":
    # Build the boolean index over the preprocessed corpus and persist it.
    directory_path = '/content/drive/My Drive/IR_Assignment1/text_files/preprocessed_files'
    index = InvertedIndex(directory_path)
    index.build_index()
    index.save_to_file('inverted_index.pkl')

    # Queries run against the reloaded pickle, which also exercises the save/load round trip.
    loaded_index = InvertedIndex.load_from_file('inverted_index.pkl')
    processor = QueryProcessor(loaded_index)

    number_of_queries = int(input("Enter the number of queries: "))
    for i in range(number_of_queries):
        query = input(f"Enter query {i+1}: ")
        # Split on bare commas and normalise each operator (trim, collapse
        # inner whitespace, upper-case) so "OR,AND NOT" and "or, and not"
        # parse the same as "OR, AND NOT"; blank entries are dropped.
        operations = [
            ' '.join(part.split()).upper()
            for part in input(f"Enter operations for query {i+1} separated by comma: ").split(',')
            if part.strip()
        ]
        result_set = processor.process_query(query, operations)

        print(f"Query {i+1}: {query}")
        print(f"Documents retrieved: {len(result_set)}")
        print(f"Document names: {', '.join(sorted(result_set))}\n")


Enter the number of queries: 2
Enter query 1: Car bag in a canister
Enter operations for query 1 separated by comma: OR, AND NOT
Query 1: Car bag in a canister
Documents retrieved: 31
Document names: file118_preprocessed.txt, file166_preprocessed.txt, file174_preprocessed.txt, file264_preprocessed.txt, file313_preprocessed.txt, file363_preprocessed.txt, file3_preprocessed.txt, file404_preprocessed.txt, file459_preprocessed.txt, file466_preprocessed.txt, file542_preprocessed.txt, file573_preprocessed.txt, file665_preprocessed.txt, file682_preprocessed.txt, file686_preprocessed.txt, file698_preprocessed.txt, file699_preprocessed.txt, file738_preprocessed.txt, file73_preprocessed.txt, file746_preprocessed.txt, file780_preprocessed.txt, file797_preprocessed.txt, file860_preprocessed.txt, file863_preprocessed.txt, file864_preprocessed.txt, file886_preprocessed.txt, file892_preprocessed.txt, file930_preprocessed.txt, file942_preprocessed.txt, file956_preprocessed.txt, file981_preprocessed.tx

In [25]:
import os
import pickle
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Ensure necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

class TextProcessor:
    """Tokenizer that keeps only lowercase, alphabetic, non-stopword terms."""

    def __init__(self):
        english = stopwords.words('english')
        # Stored as a set for fast membership checks during filtering.
        self.stop_words = set(english)

    def preprocess(self, text):
        """Return the index terms of ``text`` in order of appearance.

        The text is lowercased and tokenized with ``word_tokenize``; tokens
        that are not purely alphabetic, or that are stopwords, are dropped.
        """
        def is_index_term(token):
            return token.isalpha() and token not in self.stop_words

        return list(filter(is_index_term, word_tokenize(text.lower())))

def defaultdict_list():
    """Build an empty mapping whose missing keys start out as empty lists.

    Used as the default factory of the outer positional index. Presumably a
    named function rather than a lambda so the nested index stays picklable.
    """
    empty_postings = defaultdict(list)
    return empty_postings

def create_positional_index(directory_path, processor):
    """Build a positional index over the ``.txt`` files of a directory.

    Args:
        directory_path: Directory to scan; only names ending in ``.txt`` are read.
        processor: Object whose ``preprocess(text)`` yields the document's tokens.

    Returns:
        Nested mapping ``term -> file name -> [positions]``, where a position
        is the 0-based offset of the term in the token sequence returned by
        ``processor.preprocess``.
    """
    index = defaultdict(defaultdict_list)
    for entry in os.listdir(directory_path):
        if not entry.endswith(".txt"):
            continue
        with open(os.path.join(directory_path, entry), 'r', encoding='utf-8') as handle:
            terms = processor.preprocess(handle.read())
            offset = 0
            for term in terms:
                index[term][entry].append(offset)
                offset += 1
    return index

def save_object(obj, file_name):
    """Serialise ``obj`` with pickle into ``file_name``, replacing any existing file."""
    with open(file_name, 'wb') as sink:
        pickle.Pickler(sink).dump(obj)

def load_object(file_name):
    """Read ``file_name`` and return the object pickled inside it."""
    with open(file_name, 'rb') as source:
        # NOTE: unpickling can run code embedded in the file; only load
        # pickles that come from a trusted source.
        restored = pickle.Unpickler(source).load()
    return restored

def process_phrase_query(query, positional_index, processor):
    """Find the documents in which the query terms occur as a consecutive phrase.

    A document matches when some position ``p`` of the first term exists such
    that term ``i`` of the query occurs at position ``p + i`` for every ``i``.

    Args:
        query: Free-text phrase; tokenized with ``processor.preprocess``.
        positional_index: Mapping ``term -> document -> [positions]``.
        processor: Object providing ``preprocess(text)`` returning a token list.

    Returns:
        ``(count, documents)`` where ``documents`` is the sorted list of
        matching document names and ``count`` is its length. ``(0, [])`` when
        the query has no usable terms or any term is missing from the index.
    """
    tokens = processor.preprocess(query)
    if not tokens:
        return 0, []

    # .get instead of [] so that looking up an unknown term does not insert
    # an empty entry into a defaultdict-based index.
    postings = [positional_index.get(token) or {} for token in tokens]
    if not all(postings):
        return 0, []

    # Only documents containing every term can contain the phrase.
    common_docs = set(postings[0]).intersection(*postings[1:])

    valid_docs = []
    # Sorted so the reported order is stable between runs.
    for doc in sorted(common_docs):
        start_positions = postings[0][doc]
        following = [set(posting[doc]) for posting in postings[1:]]
        # Try every occurrence of the first term as the phrase start; pairing
        # the k-th occurrences of each term would miss most real phrases.
        if any(
            all(start + offset in positions
                for offset, positions in enumerate(following, start=1))
            for start in start_positions
        ):
            valid_docs.append(doc)

    return len(valid_docs), valid_docs

if __name__ == "__main__":
    # Index the already-preprocessed corpus and persist the result
    directory_path = '/content/drive/My Drive/IR_Assignment1/text_files/preprocessed_files'
    index_file_name = 'positional_index.pkl'
    processor = TextProcessor()
    positional_index = create_positional_index(directory_path, processor)
    save_object(positional_index, index_file_name)

    # Queries are answered from the reloaded copy of the index
    loaded_positional_index = load_object(index_file_name)

    # Collect all queries up front, then answer them one by one
    number_of_queries = int(input("Enter the number of queries: "))
    queries = []
    for i in range(number_of_queries):
        queries.append(input(f"Enter query {i+1}: "))

    for i in range(len(queries)):
        query = queries[i]
        count, documents = process_phrase_query(query, loaded_positional_index, processor)
        print(f"Number of documents retrieved for query {i+1} using positional index: {count}")
        print(f"Names of documents retrieved for query {i+1} using positional index: {', '.join(documents)}\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter the number of queries: 3
Enter query 1: it is a good in front for poutch
Enter query 2: it is good in reliable for fit
Enter query 3: it is a fit front poutch
Number of documents retrieved for query 1 using positional index: 0
Names of documents retrieved for query 1 using positional index: 

Number of documents retrieved for query 2 using positional index: 1
Names of documents retrieved for query 2 using positional index: file9_preprocessed.txt

Number of documents retrieved for query 3 using positional index: 1
Names of documents retrieved for query 3 using positional index: file9_preprocessed.txt

