In [10]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import math
from collections import defaultdict

In [9]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [25]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return tokens

def load_documents(folder_path):
    docs = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                docs[filename] = preprocess(file.read())
    return docs

# Compute term frequencies and document frequencies
def compute_statistics(docs):
    doc_count = len(docs)
    term_doc_freq = defaultdict(int)  # Document frequency (DF) for each term
    term_freq = defaultdict(lambda: defaultdict(int))  # Term frequency (TF) for each term in each document
    doc_length = {}  # Store document lengths for normalization
    total_doc_length = 0  # Store the total length of all documents

    for doc_id, words in docs.items():
        doc_length[doc_id] = len(words)
        total_doc_length += len(words)

        word_set = set(words)
        for word in words:
            term_freq[doc_id][word] += 1
        for word in word_set:
            term_doc_freq[word] += 1

    avg_doc_length = total_doc_length / doc_count
    return term_freq, term_doc_freq, doc_count, doc_length, avg_doc_length

def compute_bm25(query, term_freq, term_doc_freq, doc_count, doc_length, avg_doc_length, k1=1.5, b=0.75):
    scores = {}
    for doc_id in term_freq:
        score = 1
        for term in query:
            tf = term_freq[doc_id].get(term, 0)  # Term frequency in document
            df = term_doc_freq.get(term, 0)      # Document frequency
            if df > 0:
                idf = math.log((doc_count - df + 0.5) / (df + 0.5) + 1)  # Inverse document frequency (IDF)
                term_score = idf * ((tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_length[doc_id] / avg_doc_length))))
                score += term_score
        scores[doc_id] = score
    return scores

def retrieve_documents(folder_path, queries, scaling_factor=0.7):
    docs = load_documents(folder_path)
    term_freq, term_doc_freq, doc_count, doc_length, avg_doc_length = compute_statistics(docs)

    # Open the results file in write mode
    with open('result.txt', 'w', encoding='utf-8') as result_file:
        for query in queries:
            processed_query = preprocess(query)  # Preprocess each query
            scores = compute_bm25(processed_query, term_freq, term_doc_freq, doc_count, doc_length, avg_doc_length)
            max_score = max(scores.values()) if scores else 1.0  # Avoid division by zero

            # Rescale the scores to make the maximum score 0.7
            for doc_id in scores:
                scores[doc_id] = (scores[doc_id] / max_score) * scaling_factor

            # Sort documents by score in descending order
            ranked_docs = sorted(scores.items(), key=lambda item: item[1], reverse=True)

            # Write results for the query to the result file
            result_file.write(f"Query: {' '.join(processed_query)}\n")
            for doc_id, score in ranked_docs:
                result_file.write(f"Document: {doc_id}, Score: {score:.4f}\n")
            result_file.write("\n")

folder_path = '/content/docs'
queries = [
    "Who is Alice",
    "The Mad Hatter's Tea Party",
    "The Queen of Hearts and the croquet game",
    "Alice's confrontation with the Queen of Hearts",
    "ORANGE MARMALADE",
    "she jumped up on to her feet in a moment:"

]
retrieve_documents(folder_path, queries)