In [8]:
from google.colab import drive

# Mount Google Drive at /content/drive (Colab-only); the corpus, the saved index
# and the result files used by this notebook all live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import os

# Corpus location on the mounted Drive (the commented path is the large collection)
folder_path = '/content/drive/My Drive/small_docs/'
#folder_path = '/content/drive/My Drive/larges_docs/full_docs/'

# Read every .txt file in the folder into memory, one string per document
documents = []
for filename in os.listdir(folder_path):
    if not filename.endswith(".txt"):
        continue
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
        documents.append(file.read())

print(f"Loaded {len(documents)} documents")

Loaded 1557 documents


In [10]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import json

# Download the NLTK resources used below. 'punkt_tab' is fetched alongside the
# legacy 'punkt' package because recent NLTK releases (3.9+) make word_tokenize
# load 'punkt_tab' and raise LookupError when only 'punkt' is installed.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Stemmer and English stopword set shared by preprocess_text
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Preprocessing pipeline applied to both documents and queries
def preprocess_text(text):
    """Turn raw text into a list of normalized index terms.

    The text is tokenized with NLTK's ``word_tokenize``. Each token is then
    lowercased, discarded unless ``str.isalnum()`` holds for it, discarded if
    it appears in ``stop_words``, and finally stemmed with ``stemmer``.
    The relative order of the surviving tokens is preserved.

    Args:
        text: The raw string to process.

    Returns:
        A list of stemmed, lowercased tokens.
    """
    processed = []
    for raw_token in word_tokenize(text):
        lowered = raw_token.lower()
        # Skip punctuation / non-alphanumeric tokens and stopwords
        if not lowered.isalnum() or lowered in stop_words:
            continue
        processed.append(stemmer.stem(lowered))
    return processed

original_directory = '/content/drive/My Drive/small_docs/'
#original_directory = '/content/drive/My Drive/larges_docs/full_docs/'

# Positional inverted index: term -> {filename -> [token positions]}
inverted_index = {}

# Only the .txt files of the directory, in sorted filename order
original_filenames = sorted(
    name for name in os.listdir(original_directory) if name.endswith(".txt")
)

# Index every document
for doc_id, filename in enumerate(original_filenames):
    file_path = os.path.join(original_directory, filename)

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

        # Run the full preprocessing pipeline on the document text
        tokens = preprocess_text(content)

        # Positions refer to the preprocessed token stream, not the raw text
        for pos, token in enumerate(tokens):
            inverted_index.setdefault(token, {}).setdefault(filename, []).append(pos)

# Persist the index as JSON on Drive
inverted_index_path = '/content/drive/My Drive/inverted_index.json'
#inverted_index_path = '/content/drive/My Drive/larges_docs/inverted_index.json'
with open(inverted_index_path, 'w', encoding='utf-8') as index_file:
    json.dump(inverted_index, index_file, indent=4)

print(f"Inverted index with positions saved at: {inverted_index_path}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Inverted index with positions saved at: /content/drive/My Drive/inverted_index.json


In [11]:
def search_inverted_index(query, inverted_index):
    """Return the set of documents that contain every indexed query term.

    The query is run through ``preprocess_text``. Query terms that do not
    occur in ``inverted_index`` are ignored (they do not force an empty
    result); the postings of the remaining terms are intersected.

    Args:
        query: Raw query string.
        inverted_index: Mapping of term -> {document -> positions}.

    Returns:
        A set of document identifiers; empty when no query term is indexed.
    """
    postings = [
        set(inverted_index[token])
        for token in preprocess_text(query)
        if token in inverted_index
    ]
    if not postings:
        return set()
    return set.intersection(*postings)
query = "types of road hugger tires"


inverted_index_path = '/content/drive/My Drive/inverted_index.json'

# Reload the saved index from Drive before searching
with open(inverted_index_path, 'r', encoding='utf-8') as file:
    inverted_index = json.load(file)

results = search_inverted_index(query, inverted_index)

if not results:
    print(f"No documents found for the query '{query}'")
else:
    print(f"Documents matching the query '{query}': {results}")

Documents matching the query 'types of road hugger tires': {'output_80.txt'}


In [12]:
import json
import math

# Read the saved inverted index back from Drive
inverted_index_path = '/content/drive/My Drive/inverted_index.json'
with open(inverted_index_path, 'r', encoding='utf-8') as index_file:
    inverted_index = json.load(index_file)

# Document length = total number of indexed token occurrences in the document
doc_lengths = {}
for term, doc_dict in inverted_index.items():
    for doc, positions in doc_dict.items():
        doc_lengths[doc] = doc_lengths.get(doc, 0) + len(positions)

# Number of documents that contain at least one indexed term
total_docs = len(doc_lengths)

def calculate_tf(term, doc):
    """Log-scaled term frequency of ``term`` in ``doc``, damped by document length.

    Computes ``(1 + ln(count)) / (1 + ln(length))`` where ``count`` is the
    number of positions recorded for the term in the document and ``length``
    is ``doc_lengths[doc]``. Reads the module-level ``inverted_index`` and
    ``doc_lengths``; a KeyError propagates if the term/document pair is not
    in the index.
    """
    occurrences = inverted_index[term][doc]
    numerator = 1 + math.log(len(occurrences))
    denominator = 1 + math.log(doc_lengths[doc])
    return numerator / denominator

# Smoothed inverse document frequency (IDF)
def calculate_idf(term):
    """Return ``ln((N + 1) / (df + 1)) + 1`` for ``term``.

    ``N`` is the module-level ``total_docs`` and ``df`` is the number of
    documents listed for the term in the module-level ``inverted_index``.
    A KeyError propagates if the term is not indexed.
    """
    containing_docs = len(inverted_index[term])
    smoothed_ratio = (total_docs + 1) / (containing_docs + 1)
    return math.log(smoothed_ratio) + 1

# Function to handle phrase queries using positional data
def check_phrase_in_document(query_tokens, doc, inverted_index):
    """Check if query tokens appear as a consecutive phrase in the document.

    Args:
        query_tokens: Preprocessed query terms, in phrase order.
        doc: Document identifier as used in the index postings.
        inverted_index: Mapping of term -> {document -> list of positions}.

    Returns:
        True if some position ``p`` exists such that token ``i`` of the query
        occurs at position ``p + i`` in the document for every ``i``.
        False for an empty query or when any query token is absent from the
        document.
    """
    if not query_tokens:
        return False

    # Bail out on the first token the document does not contain.
    position_lists = []
    for token in query_tokens:
        if token not in inverted_index or doc not in inverted_index[token]:
            return False
        position_lists.append(inverted_index[token][doc])

    # Sets give O(1) membership tests for the follow-up tokens; the original
    # list lookups scanned the whole position list for every candidate start.
    following_positions = [set(positions) for positions in position_lists[1:]]

    for start_pos in position_lists[0]:
        if all(
            (start_pos + offset) in positions
            for offset, positions in enumerate(following_positions, start=1)
        ):
            return True
    return False

# Collect the documents that contain the query as an exact phrase
def score_phrase_query(query_tokens, inverted_index):
    """Return every document in which ``query_tokens`` occur consecutively.

    Each key of the module-level ``doc_lengths`` is tested with
    ``check_phrase_in_document``; matches are returned as a list in the
    iteration order of ``doc_lengths``.
    """
    return [
        doc
        for doc in doc_lengths
        if check_phrase_in_document(query_tokens, doc, inverted_index)
    ]


def calculate_vector_magnitude(vector):
    """Return the Euclidean (L2) norm of an iterable of numbers."""
    squared_total = sum(component ** 2 for component in vector)
    return math.sqrt(squared_total)

def score_documents(query_tokens, inverted_index):
    """Rank the documents that contain at least one indexed query term.

    Each distinct indexed query term is weighted by ``calculate_idf``. For
    every document, the squared ``tf * idf`` values of the query terms it
    contains are summed (a term repeated in ``query_tokens`` is counted once
    per repetition), and that sum is divided by the product of the magnitude
    of the document's tf-idf values over the query terms and the magnitude
    of the query's IDF vector.

    NOTE(review): the document magnitude only covers the query terms, not
    the whole document vector, so this is not the textbook cosine measure --
    confirm this is intended.

    Args:
        query_tokens: Preprocessed query terms.
        inverted_index: Mapping of term -> {document -> positions}.

    Returns:
        A list of ``(document, score)`` tuples sorted by descending score.
    """
    # IDF weight for each distinct query term present in the index
    query_term_weights = {
        term: calculate_idf(term)
        for term in set(query_tokens)
        if term in inverted_index
    }

    squared_weight_total = 0
    for weight in query_term_weights.values():
        squared_weight_total += weight ** 2
    query_vector_magnitude = math.sqrt(squared_weight_total)

    # Query terms known to the index, in query order (duplicates kept)
    indexed_terms = [term for term in query_tokens if term in inverted_index]

    # Accumulate squared tf-idf contributions per document
    scores = {}
    for term in indexed_terms:
        idf = query_term_weights[term]
        for doc in inverted_index[term]:
            contribution = (calculate_tf(term, doc) * idf) ** 2
            if doc in scores:
                scores[doc] += contribution
            else:
                scores[doc] = contribution

    # Normalize; documents are left unscaled if either magnitude is zero
    for doc in scores:
        doc_components = [
            calculate_tf(term, doc) * query_term_weights[term]
            for term in indexed_terms
            if doc in inverted_index[term]
        ]
        doc_vector_magnitude = calculate_vector_magnitude(doc_components)
        if doc_vector_magnitude == 0 or query_vector_magnitude == 0:
            continue
        scores[doc] = scores[doc] / (doc_vector_magnitude * query_vector_magnitude)

    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)


query = "types of road hugger tires"

#query = pd.read_csv('/content/drive/My Drive/small_queries.csv')

# Rank the documents for the single demo query
query_tokens = preprocess_text(query)
results = score_documents(query_tokens, inverted_index)

if not results:
    print(f"No documents found for the query '{query}'")
else:
    print(f"Ranked documents for the query '{query}':")
    for doc, score in results:
        print(f"{doc}: {score:.4f}")


Ranked documents for the query 'types of road hugger tires':
output_590.txt: 0.3556
output_965.txt: 0.3289
output_58.txt: 0.2692
output_655.txt: 0.1979
output_483.txt: 0.1921
output_171.txt: 0.1907
output_1220.txt: 0.1853
output_842.txt: 0.1775
output_38.txt: 0.1771
output_986.txt: 0.1755
output_203.txt: 0.1697
output_1256.txt: 0.1692
output_1287.txt: 0.1579
output_1174.txt: 0.1556
output_650.txt: 0.1538
output_1176.txt: 0.1532
output_911.txt: 0.1451
output_1369.txt: 0.1440
output_544.txt: 0.1431
output_1445.txt: 0.1427
output_1038.txt: 0.1402
output_352.txt: 0.1392
output_35.txt: 0.1339
output_649.txt: 0.1338
output_1467.txt: 0.1332
output_1207.txt: 0.1301
output_1520.txt: 0.1275
output_488.txt: 0.1247
output_622.txt: 0.1232
output_880.txt: 0.1225
output_560.txt: 0.1220
output_943.txt: 0.1217
output_469.txt: 0.1212
output_1326.txt: 0.1191
output_80.txt: 0.1187
output_971.txt: 0.1185
output_1453.txt: 0.1160
output_966.txt: 0.1140
output_930.txt: 0.1138
output_674.txt: 0.1135
output_123

In [13]:
import pandas as pd
import json
import math

# Read the saved inverted index back from Drive
inverted_index_path = '/content/drive/My Drive/inverted_index.json'
with open(inverted_index_path, 'r', encoding='utf-8') as index_file:
    inverted_index = json.load(index_file)

# Recompute document lengths (total indexed token occurrences per document)
doc_lengths = {}
for term, doc_dict in inverted_index.items():
    for doc, positions in doc_dict.items():
        doc_lengths[doc] = doc_lengths.get(doc, 0) + len(positions)
total_docs = len(doc_lengths)

# preprocess_text and score_documents are not defined in this cell; they must
# already exist in the kernel from earlier cells.

# Test queries, one row per query
queries_df = pd.read_csv('/content/drive/My Drive/small_queries.csv')

all_queries_results = {}

# Keep the 10 best-ranked documents for every query
for index, row in queries_df.iterrows():
    query_number = row['Query number']
    query_text = row['Query']

    query_tokens = preprocess_text(query_text)
    results = score_documents(query_tokens, inverted_index)

    top_documents = []
    for doc, score in results[:10]:
        top_documents.append(doc)
    all_queries_results[query_number] = top_documents

# Flatten to one (query, document) row per retrieved document and save as CSV
result_rows = []
for query_id, docs in all_queries_results.items():
    result_rows.extend({'Query number': query_id, 'Document': doc} for doc in docs)
output_df = pd.DataFrame(result_rows)

output_path = '/content/drive/My Drive/results.csv'
output_df.to_csv(output_path, index=False)

print(f"Test output saved to {output_path}")


Test output saved to /content/drive/My Drive/results.csv


In [18]:
import json
import math
import pandas as pd

# Read the saved inverted index back from Drive
inverted_index_path = '/content/drive/My Drive/inverted_index.json'
with open(inverted_index_path, 'r', encoding='utf-8') as index_file:
    inverted_index = json.load(index_file)

# Document length = total number of indexed token occurrences in the document
doc_lengths = {}

for term, doc_dict in inverted_index.items():
    for doc, positions in doc_dict.items():
        doc_lengths[doc] = doc_lengths.get(doc, 0) + len(positions)

# Number of documents that contain at least one indexed term
total_docs = len(doc_lengths)
def calculate_tf(term, doc):
    """Length-damped, log-scaled term frequency of ``term`` in ``doc``.

    Returns ``(1 + ln(count)) / (1 + ln(length))``: ``count`` is how many
    positions the module-level ``inverted_index`` stores for the pair and
    ``length`` comes from the module-level ``doc_lengths``. Raises KeyError
    when the term/document pair is not indexed.
    """
    positions = inverted_index[term][doc]
    scaled_count = 1 + math.log(len(positions))
    scaled_length = 1 + math.log(doc_lengths[doc])
    return scaled_count / scaled_length
# Smoothed inverse document frequency (IDF)
def calculate_idf(term):
    """Return ``ln((N + 1) / (df + 1)) + 1`` for ``term``.

    ``N`` is the module-level ``total_docs``; ``df`` is the number of
    documents the module-level ``inverted_index`` lists for the term.
    Raises KeyError when the term is not indexed.
    """
    document_frequency = len(inverted_index[term])
    smoothed_ratio = (total_docs + 1) / (document_frequency + 1)
    return math.log(smoothed_ratio) + 1

# Euclidean length of a weight vector, used for score normalization
def calculate_vector_magnitude(vector):
    """Return the square root of the sum of squared components of ``vector``."""
    sum_of_squares = sum(entry ** 2 for entry in vector)
    return math.sqrt(sum_of_squares)

# Rank documents by normalized tf-idf score for a preprocessed query
def score_documents(query_tokens, inverted_index):
    """Rank the documents that contain at least one indexed query term.

    Distinct indexed query terms are weighted by ``calculate_idf``. For each
    document the squared ``tf * idf`` values of the query terms it contains
    are summed (a term repeated in ``query_tokens`` contributes once per
    repetition). The sum is then divided by the magnitude of the document's
    tf-idf values over the query terms times the magnitude of the query's
    IDF vector.

    NOTE(review): the document magnitude is computed over the query terms
    only, not over the whole document vector -- confirm this is intended.

    Args:
        query_tokens: Preprocessed query terms.
        inverted_index: Mapping of term -> {document -> positions}.

    Returns:
        ``(document, score)`` tuples ordered from highest to lowest score.
    """
    # One IDF weight per distinct query term found in the index
    query_term_weights = {}
    weight_square_sum = 0
    for term in set(query_tokens):
        if term not in inverted_index:
            continue
        weight = calculate_idf(term)
        query_term_weights[term] = weight
        weight_square_sum += weight ** 2
    query_vector_magnitude = math.sqrt(weight_square_sum)

    # Query terms the index knows about, in query order (duplicates kept)
    matched_terms = [term for term in query_tokens if term in inverted_index]

    # Sum of squared tf-idf contributions per document
    scores = {}
    for term in matched_terms:
        term_weight = query_term_weights[term]
        for doc in inverted_index[term]:
            squared = (calculate_tf(term, doc) * term_weight) ** 2
            if doc in scores:
                scores[doc] += squared
            else:
                scores[doc] = squared

    # Normalize; a score is left as-is when either magnitude is zero
    for doc in scores:
        weighted_terms = [
            calculate_tf(term, doc) * query_term_weights[term]
            for term in matched_terms
            if doc in inverted_index[term]
        ]
        doc_vector_magnitude = calculate_vector_magnitude(weighted_terms)
        if doc_vector_magnitude != 0 and query_vector_magnitude != 0:
            scores[doc] = scores[doc] / (doc_vector_magnitude * query_vector_magnitude)

    ranked = list(scores.items())
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked

# Function to calculate Precision at K (P@K)
def precision_at_k(retrieved_docs, relevant_docs, k):
    """Return the fraction of the top-``k`` retrieved documents that are relevant.

    Args:
        retrieved_docs: Ranked list of retrieved document identifiers.
        relevant_docs: Collection of relevant document identifiers.
        k: Rank cutoff.

    Returns:
        ``hits / k`` as a float, where ``hits`` is the number of entries in
        ``retrieved_docs[:k]`` found in ``relevant_docs``. The divisor is
        always ``k``, even when fewer than ``k`` documents were retrieved.
        Returns 0.0 when ``k`` is not positive, instead of dividing by zero
        or slicing with a negative bound.
    """
    if k <= 0:
        return 0.0
    relevant_count = sum(1 for doc in retrieved_docs[:k] if doc in relevant_docs)
    return relevant_count / k

# Recall at K (R@K)
def recall_at_k(retrieved_docs, relevant_docs, k):
    """Return the share of relevant documents found in the top ``k`` results.

    Counts the entries of ``retrieved_docs[:k]`` that occur in
    ``relevant_docs`` and divides by ``len(relevant_docs)``. Returns 0 when
    ``relevant_docs`` is empty.
    """
    top_k = retrieved_docs[:k]
    hits = 0
    for doc in top_k:
        if doc in relevant_docs:
            hits += 1
    if len(relevant_docs) > 0:
        return hits / len(relevant_docs)
    return 0

# Function to calculate Mean Average Precision at K (MAP@K)
def mean_average_precision_at_k(all_queries_results, all_queries_relevant_docs, k):
    """Return the mean over queries of average precision truncated at rank ``k``.

    For each query, the precision at every rank (up to ``k``) that holds a
    relevant document is summed and divided by the number of relevant
    documents for that query.

    Queries that have relevant documents but none of them in the top ``k``
    contribute an average precision of 0 to the mean; previously they were
    left out, which inflated the result. Queries with no ground-truth
    entries are skipped because their average precision is undefined.

    Args:
        all_queries_results: Mapping of query id -> ranked list of documents.
        all_queries_relevant_docs: Mapping of query id -> relevant documents.
        k: Rank cutoff.

    Returns:
        The mean average precision, or 0 when no query has ground truth.
    """
    average_precisions = []
    for query_id, retrieved_docs in all_queries_results.items():
        relevant_docs = all_queries_relevant_docs.get(query_id, [])
        if len(relevant_docs) == 0:
            continue

        # A running hit count gives precision at each rank without
        # re-counting the prefix for every position.
        hits = 0
        precision_sum = 0.0
        for rank, doc in enumerate(retrieved_docs[:max(k, 0)], start=1):
            if doc in relevant_docs:
                hits += 1
                precision_sum += hits / rank

        average_precisions.append(precision_sum / len(relevant_docs))

    return sum(average_precisions) / len(average_precisions) if average_precisions else 0

# Mean recall at K over all queries (MAR@K)
def mean_average_recall_at_k(all_queries_results, all_queries_relevant_docs, k):
    """Average ``recall_at_k`` over every query in ``all_queries_results``.

    A query without an entry in ``all_queries_relevant_docs`` is evaluated
    against an empty relevant list. Returns 0 when there are no queries.
    """
    recalls = [
        recall_at_k(retrieved_docs, all_queries_relevant_docs.get(query_id, []), k)
        for query_id, retrieved_docs in all_queries_results.items()
    ]
    if not recalls:
        return 0
    return sum(recalls) / len(recalls)

# Load the CSV file containing the queries (replace with your file path)
queries_df = pd.read_csv('/content/drive/My Drive/small_queries.csv')  # Replace with your actual path


# Load the relevance judgements (ground truth).
# NOTE(review): this is the same path the batch-query cell writes this system's
# own top-10 output to, so the metrics below compare the ranking with itself.
# Point this at the real relevance file before trusting the numbers.
ground_truth_path = ('/content/drive/My Drive/results.csv')  # Replace with your actual path
ground_truth_df = pd.read_csv(ground_truth_path)

# Ground truth dictionary: query number -> list of relevant documents
all_queries_relevant_docs = {}
for _, row in ground_truth_df.iterrows():
    query_number = row['Query number']
    document = row['Document']
    all_queries_relevant_docs.setdefault(query_number, []).append(document)

# Start from an empty mapping so this cell does not depend on a dictionary
# left behind by an earlier cell (it was previously never initialised here).
all_queries_results = {}

for index, row in queries_df.iterrows():
    query_number = row['Query number']
    query_text = row['Query']

    # Preprocess the query text
    query_tokens = preprocess_text(query_text)

    # Get the ranked results for the query
    results = score_documents(query_tokens, inverted_index)

    # Store the top 10 results for each query
    all_queries_results[query_number] = [doc for doc, score in results[:10]]

# Compute MAP@3, MAP@10, MAR@3, MAR@10
map_at_3 = mean_average_precision_at_k(all_queries_results, all_queries_relevant_docs, 3)
map_at_10 = mean_average_precision_at_k(all_queries_results, all_queries_relevant_docs, 10)
mar_at_3 = mean_average_recall_at_k(all_queries_results, all_queries_relevant_docs, 3)
mar_at_10 = mean_average_recall_at_k(all_queries_results, all_queries_relevant_docs, 10)

print(f"MAP@3: {map_at_3:.4f}")
print(f"MAP@10: {map_at_10:.4f}")
print(f"MAR@3: {mar_at_3:.4f}")
print(f"MAR@10: {mar_at_10:.4f}")


MAP@3: 0.3205
MAP@10: 1.0000
MAR@3: 0.3179
MAR@10: 0.9919
