In [None]:
%pip install nltk

import os
import math
from collections import defaultdict
import re
import nltk
nltk.download('all')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Download the individual NLTK resources by name.
# NOTE(review): the import cell already calls nltk.download('all'), which
# presumably includes all four of these — confirm whether this cell is still needed.
nltk.download('punkt')      # presumably the tokenizer models behind word_tokenize — confirm
nltk.download('stopwords')  # corpus read via stopwords.words('english') in preprocess()
nltk.download('wordnet')    # presumably the data WordNetLemmatizer needs — confirm
nltk.download('punkt_tab')  # NOTE(review): looks like the newer tokenizer data format — confirm

In [None]:
def preprocess(text):
    """Turn raw text into a flat list of index tokens.

    Steps, in order: lowercase; delete every character that is neither a
    word character nor whitespace; delete digit runs; tokenize with NLTK;
    drop English stopwords; lemmatize; Porter-stem.  Each surviving stem is
    then emitted followed by its Soundex code (unless the code equals the
    stem itself).

    Args:
        text: The raw document or query string.

    Returns:
        list[str]: stems interleaved with their Soundex codes, in text order.
    """
    # Case-fold first, then strip punctuation and numbers.
    lowered = text.lower()
    no_punct = re.sub(r'[^\w\s]', '', lowered)
    no_digits = re.sub(r'\d+', '', no_punct)

    raw_tokens = word_tokenize(no_digits)

    # Stopword filtering happens before lemmatizing/stemming.
    english_stopwords = set(stopwords.words('english'))
    content_tokens = [tok for tok in raw_tokens if tok not in english_stopwords]

    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(tok) for tok in content_tokens]

    stemmer = nltk.stem.PorterStemmer()
    stems = [stemmer.stem(lemma) for lemma in lemmas]

    # Emit every stem, followed by its phonetic code when that differs.
    expanded = []
    for stem in stems:
        expanded.append(stem)
        phonetic = soundex(stem)
        if phonetic != stem:
            expanded.append(phonetic)

    return expanded

In [None]:
def soundex(word):
    """Return the 4-character American Soundex code of ``word``.

    Rules implemented:
      * The first character is kept as-is (upper-cased).
      * Remaining consonants are replaced by their digit class.
      * Adjacent letters with the same digit are coded once; this includes
        a letter that shares its digit with the first letter ("Pfister" -> P236).
      * H and W are transparent: equal digits on either side of them are
        still treated as adjacent ("Ashcraft" -> A261).
      * Any other unmapped character (vowels, Y, ...) separates runs, so the
        same digit may be emitted again after it ("Tymczak" -> T522).
      * The result is padded with zeros / truncated to exactly 4 characters.

    Args:
        word: The word to encode; case-insensitive.

    Returns:
        str: the 4-character code, or "" when ``word`` is empty.
    """
    if not word:
        return ""

    word = word.upper()

    # Consonant -> digit classes.
    mapping = {
        'B': '1', 'F': '1', 'P': '1', 'V': '1',
        'C': '2', 'G': '2', 'J': '2', 'K': '2', 'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
        'D': '3', 'T': '3',
        'L': '4',
        'M': '5', 'N': '5',
        'R': '6'
    }

    soundex_code = word[0]
    # Seed with the first letter's own digit so an immediately following
    # letter of the same class is suppressed.
    previous_digit = mapping.get(word[0], '')

    for char in word[1:]:
        digit = mapping.get(char)
        if digit is not None:
            if digit != previous_digit:
                soundex_code += digit
            previous_digit = digit
        elif char not in 'HW':
            # Vowels (and any other unmapped character) end the current run.
            previous_digit = ''
        # H and W leave previous_digit untouched on purpose.

    # Pad with zeros or truncate to 4 characters.
    return (soundex_code + '000')[:4]

In [None]:
# Read documents from a directory
def read_documents(directory):
    """Read and preprocess every regular file directly inside ``directory``.

    Reading is best-effort: a problem with one entry (undecodable bytes,
    permission error, ...) is reported and that entry is skipped, instead of
    abandoning all remaining files.  Sub-directories are ignored.

    Args:
        directory: Path of the folder holding the documents (UTF-8 text).

    Returns:
        dict[str, list[str]]: filename -> token list from ``preprocess``.
        Empty when the directory cannot be listed.
    """
    documents = {}

    try:
        # Sorted so the resulting dict order does not depend on the filesystem.
        filenames = sorted(os.listdir(directory))
    except Exception as e:
        print('An error occurred:', e)
        return documents

    for filename in filenames:
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            # Sub-directories and other non-file entries cannot be opened as text.
            continue
        try:
            with open(path, 'r', encoding='utf-8') as file:
                text = file.read()
            documents[filename] = preprocess(text)
        except Exception as e:
            # Deliberately broad: keep the original "never raise" contract,
            # but limit the damage to the one offending file.
            print(f"An error occurred while reading '{filename}':", e)

    return documents

In [None]:
# Creating the dictionary of all words in the documents
def create_dictionary(documents):
    """Return the set of distinct tokens found across all documents.

    Args:
        documents: Mapping of document name -> iterable of tokens.

    Returns:
        set: every token that occurs in at least one document.
    """
    return {token for tokens in documents.values() for token in tokens}

In [None]:
# Computing the term frequency
def term_frequency(documents):
    """Count raw token occurrences per document.

    Args:
        documents: Mapping of document name -> list of tokens.

    Returns:
        defaultdict: document name -> defaultdict(int) of token -> count.
        A document with no tokens gets no entry.  Because both levels are
        defaultdicts, looking up a missing key inserts an empty/zero entry.
    """
    counts_by_doc = defaultdict(lambda: defaultdict(int))
    for doc_name in documents:
        for term in documents[doc_name]:
            per_doc = counts_by_doc[doc_name]
            per_doc[term] = per_doc[term] + 1
    return counts_by_doc

# Weighted term frequency
def weighted_term_frequency(tf):
    """Log-scale a raw term count: ``1 + log10(tf)``, or 0 when ``tf`` is not positive.

    Args:
        tf: Raw number of occurrences of a term.

    Returns:
        The log-weighted frequency (float), or the integer 0 for ``tf <= 0``.
    """
    if not (tf > 0):
        return 0
    return 1 + math.log10(tf)

In [None]:
def calculate_document_frequencies(posting_list):
    """Derive each term's document frequency from its postings.

    Args:
        posting_list: Mapping of term -> sequence of postings (one per document).

    Returns:
        dict: term -> number of postings recorded for that term.
    """
    return {term: len(postings) for term, postings in posting_list.items()}


In [None]:
# Building the postings list (term -> documents containing it, with weights)
def postings_list(documents, unique_words):
    """Build the inverted index with log-weighted term frequencies.

    Args:
        documents: Mapping of document name -> list of tokens.
        unique_words: Iterable of terms to index (normally the vocabulary).

    Returns:
        defaultdict(list): term -> list of ``(document name, weighted tf)``
        tuples, one per document containing the term, in ``documents`` order.
        Terms that occur in no document get no entry.
    """
    postings = defaultdict(list)

    # Raw counts per document; membership tests below are dict lookups
    # instead of scanning each token list once per vocabulary word.
    tf = term_frequency(documents)
    # .get() avoids inserting empty entries into the defaultdict for
    # documents that have no tokens.
    counts_by_doc = {filename: tf.get(filename, {}) for filename in documents}

    for word in unique_words:
        for filename, counts in counts_by_doc.items():
            count = counts.get(word, 0)
            if count > 0:
                postings[word].append((filename, weighted_term_frequency(count)))

    return postings

In [None]:
# Computing the document length
def doc_length(tf):
    """Compute the Euclidean length of each document's log-weighted tf vector.

    Args:
        tf: Mapping of document name -> mapping of term -> raw count.

    Returns:
        defaultdict(float): document name -> sqrt(sum((1 + log10(count))**2)).
        Terms whose count is not positive contribute nothing; such zero
        entries appear when a missing term was merely looked up in a
        defaultdict(int) counter, and log10 of them would raise.
    """
    doc_lengths = defaultdict(float)
    for filename, terms in tf.items():
        squared_sum = sum(
            (1 + math.log10(freq)) ** 2
            for freq in terms.values()
            if freq > 0
        )
        doc_lengths[filename] = math.sqrt(squared_sum)
    return doc_lengths

In [None]:
# Calculate cosine similarity for the given document
def cosine_similarity(query_wt, doc_wt, doc_len, doc_id):
    """Cosine similarity between a query vector and one document vector.

    Args:
        query_wt: Mapping of term -> query weight.
        doc_wt: Mapping of term -> document weight.
        doc_len: Mapping of document id -> precomputed document vector length.
        doc_id: Id of the document being scored (key into ``doc_len``).

    Returns:
        dict: ``{doc_id: similarity}``.  The similarity is 0.0 when either
        the query vector or the document has zero length, since no overlap
        is possible and the ratio would otherwise divide by zero.
    """
    # Dot product over the terms shared by query and document.
    dot_product = sum(
        weight * doc_wt[term]
        for term, weight in query_wt.items()
        if term in doc_wt
    )

    # Magnitude of the query vector.
    query_magnitude = math.sqrt(sum(weight ** 2 for weight in query_wt.values()))

    denominator = query_magnitude * doc_len[doc_id]
    if denominator == 0:
        return {doc_id: 0.0}

    return {doc_id: dot_product / denominator}


In [None]:
def calculate_tf(freq):
    """Log-normalized term frequency: ``1 + log10(freq)`` for positive counts, else 0.

    Args:
        freq: Raw number of occurrences of the term.

    Returns:
        The weighted frequency (float), or the integer 0 when ``freq`` is not positive.
    """
    weight = 0
    if freq > 0:
        weight = 1 + math.log10(freq)
    return weight

def calculate_idf(df, N):
    """Inverse document frequency: ``log10(N / df)``.

    Args:
        df: Number of documents containing the term; must be non-zero.
        N: Total number of documents.

    Returns:
        float: the base-10 log of the ratio ``N / df``.
    """
    ratio = N / df
    return math.log10(ratio)

def rank_documents(documents, query, posting_list, document_frequencies, unique_words):
    """Rank every document against ``query`` by tf-idf cosine similarity.

    The query is preprocessed like the documents, its terms are weighted
    with log-tf * idf, and each document is scored with
    ``cosine_similarity`` using log-tf document weights.

    Args:
        documents: Mapping of document name -> token list; supplies the
            collection size and the set of documents to score.
        query: Raw query string.
        posting_list: Mapping of term -> list of ``(document name, log tf)``.
        document_frequencies: Mapping of term -> number of documents containing it.
        unique_words: Document vocabulary.  Kept for interface compatibility;
            terms absent from the query carry zero query weight and cannot
            influence any score, so it is not consulted.

    Returns:
        list[tuple[str, float]]: ``(document name, similarity)`` for every
        document, sorted by similarity (descending), ties broken by the first
        number in the document name (ascending; 0 when there is none).
        Documents with no indexed terms, and all documents when no query
        term is known to the index, score 0.0.
    """
    def extract_doc_id(filename):
        # First run of digits in the name, e.g. "doc12.txt" -> 12.
        numbers = re.findall(r'\d+', filename)
        return int(numbers[0]) if numbers else 0

    N = len(documents)  # Total number of documents
    query_tokens = preprocess(query)

    # Raw term counts of the query.
    query_counts = defaultdict(int)
    for token in query_tokens:
        query_counts[token] += 1

    # Query tf-idf weights; terms unknown to the index are dropped.
    query_vector = {}
    for word, count in query_counts.items():
        df = document_frequencies.get(word, 0)
        if df > 0:
            query_vector[word] = calculate_tf(count) * calculate_idf(df, N)

    # Document lengths over the full index (for cosine normalization).
    doc_lengths = defaultdict(float)
    for postings in posting_list.values():
        for doc, log_tf in postings:
            doc_lengths[doc] += log_tf ** 2
    for doc in doc_lengths:
        doc_lengths[doc] = math.sqrt(doc_lengths[doc])

    # Document vectors restricted to the query terms: only those can
    # contribute to the dot product.
    doc_vectors = defaultdict(dict)
    for word in query_vector:
        for doc, log_tf in posting_list.get(word, []):
            doc_vectors[doc][word] = log_tf

    # An all-zero query vector (or a zero-length document) would make the
    # cosine denominator zero; such pairs are scored 0.0 directly.
    query_has_weight = any(weight != 0 for weight in query_vector.values())

    similarities = {}
    for doc_name in documents:
        if not query_has_weight or doc_lengths.get(doc_name, 0.0) == 0:
            similarities[doc_name] = 0.0
            continue
        similarities.update(
            cosine_similarity(
                query_vector, doc_vectors.get(doc_name, {}), doc_lengths, doc_name
            )
        )

    # Sort by similarity (descending) then by docID (ascending)
    ranked_docs = sorted(
        similarities.items(),
        key=lambda item: (-item[1], extract_doc_id(item[0])),
    )
    return ranked_docs

