In [4]:
import os
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import math

In [2]:
# Fetch the NLTK data packages needed by the rest of the notebook.
# 'punkt' is unpacked under tokenizers/ and 'stopwords' under corpora/
# (see the download log below); 'wordnet' is presumably the data behind
# WordNetLemmatizer -- confirm if the lemmatizer raises a LookupError.
nltk.download('punkt')
nltk.download('stopwords')
# Last bare expression of the cell: its return value is what the notebook displays.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [8]:
# English stop words, held in a set so preprocess() can test membership quickly.
stop_words = {*stopwords.words('english')}
# Single lemmatizer instance shared by every preprocess() call.
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Turn raw text into a list of lemmatized tokens without stop words.

    The steps run in this order:
      1. drop every character that is neither a word character nor whitespace,
      2. drop runs of digits,
      3. lower-case the text,
      4. tokenize with ``word_tokenize``,
      5. discard tokens present in ``stop_words`` and lemmatize the rest.

    Args:
        text: The raw string to normalise.

    Returns:
        A list of lemmatized token strings, in their original order.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    without_digits = re.sub(r'\d+', '', without_punctuation)
    words = word_tokenize(without_digits.lower())

    lemmas = []
    for word in words:
        # The stop-word check is made on the raw token, before lemmatization.
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

def term_frequency(term, document):
    """Return the fraction of entries in ``document`` that equal ``term``.

    Args:
        term: The token to count.
        document: A sequence of tokens supporting ``len()`` and ``.count()``.

    Returns:
        ``document.count(term) / len(document)``, or ``0`` for an empty document.
    """
    size = len(document)
    if not size:
        # Avoid dividing by zero for empty documents.
        return 0
    occurrences = document.count(term)
    return occurrences / size

def inverse_document_frequency(term, all_documents):
    """Return the smoothed inverse document frequency of ``term``.

    Computes ``log((1 + N) / (1 + df))`` where ``N`` is the number of
    documents and ``df`` the number of documents containing ``term``.

    Both numerator and denominator are smoothed by one. Smoothing only the
    denominator (``log(N / (1 + df))``) yields a negative weight for a term
    present in every document and a math domain error for an empty corpus.
    With this form the result is never negative, never increases as ``df``
    grows, and is ``0.0`` for an empty corpus or a term found in every
    document.

    Args:
        term: The token whose IDF is wanted.
        all_documents: A sequence of tokenized documents (each a container
            supporting ``in``).

    Returns:
        The IDF weight as a non-negative float.
    """
    num_docs = len(all_documents)
    num_docs_containing_term = sum(1 for document in all_documents if term in document)
    return math.log((1 + num_docs) / (1 + num_docs_containing_term))

def calculate_precision_at_k(relevant_docs, ranked_docs, k):
    """Compute Precision@k for a ranked result list.

    Args:
        relevant_docs: Container of document ids judged relevant.
        ranked_docs: Sequence of ``(doc_id, score)`` pairs, best first.
        k: Cut-off rank; must be a positive integer.

    Returns:
        The number of relevant ids among the first ``k`` ranked entries
        divided by ``k``. The denominator is always ``k``, even when fewer
        than ``k`` results are available. Returns ``0.0`` when
        ``ranked_docs`` is empty.

    Raises:
        ValueError: If ``ranked_docs`` is non-empty and ``k`` is not positive.
            Previously ``k == 0`` raised ``ZeroDivisionError`` and a negative
            ``k`` silently produced a negative "precision".
    """
    if not ranked_docs:
        return 0.0
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    top_k_docs = ranked_docs[:k]
    relevant_retrieved = sum(1 for doc_id, _ in top_k_docs if doc_id in relevant_docs)
    return relevant_retrieved / k

# Directory holding the plain-text corpus (one document per .txt file).
corpus_dir = '/content/docs'

# Parallel lists: documents[i] is the token list for the file named filenames[i].
documents = []
filenames = []
# os.listdir() returns entries in arbitrary order; iterating in sorted order
# keeps the document order (and therefore tie-breaks in the ranking) the same
# from run to run.
for filename in sorted(os.listdir(corpus_dir)):
    if filename.endswith(".txt"):
        with open(os.path.join(corpus_dir, filename), 'r', encoding='utf-8') as file:
            doc_text = file.read()
        # The file is closed before the text is tokenized.
        documents.append(preprocess(doc_text))
        filenames.append(filename)

# Ground-truth relevance judgements: query text -> files considered relevant.
relevance_dict = dict([
    ("Tasmanian Aboriginal And PResident", ["doc2.txt", "doc4.txt"]),
    ("Scientists typhoons and hurricanes Francis", ["doc10.txt", "doc5.txt"]),
    ("Oceangate’s Titan and Space", ["doc5.txt", "doc9.txt"]),
])

# The queries to evaluate are exactly the judged ones, in insertion order.
queries = list(relevance_dict)

# Cut-off rank used for Precision@K.
K = 5

# Rank every document against every query with TF-IDF + cosine similarity and
# write the scores and Precision@K to result.txt.

if not documents:
    # Fail early with a clear message instead of building empty matrices.
    raise ValueError(f"No .txt documents were loaded from '{corpus_dir}'; nothing to rank.")

# `documents` does not change while the queries run, so each term's IDF is
# computed once and reused for every document vector and every query.
idf_cache = {}

# Explicit UTF-8: a query contains a non-ASCII apostrophe, and the platform's
# default encoding is not guaranteed to be able to represent it.
with open("result.txt", "w", encoding="utf-8") as result_file:
    for query in queries:
        processed_query = preprocess(query)

        # Vocabulary = all corpus terms plus this query's terms. Sorting fixes
        # the column order of the vectors so runs are reproducible.
        all_terms = set(term for doc in documents for term in doc).union(processed_query)
        vocabulary = sorted(all_terms)

        for term in vocabulary:
            if term not in idf_cache:
                idf_cache[term] = inverse_document_frequency(term, documents)

        # One row per document, one column per vocabulary term.
        tfidf_documents = np.array([
            [term_frequency(term, doc) * idf_cache[term] for term in vocabulary]
            for doc in documents
        ])
        # Single-row matrix so it can be passed straight to cosine_similarity.
        tfidf_query = np.array([
            [term_frequency(term, processed_query) * idf_cache[term] for term in vocabulary]
        ])

        cosine_similarities = cosine_similarity(tfidf_query, tfidf_documents).flatten()

        # Highest similarity first.
        ranked_results = sorted(zip(filenames, cosine_similarities), key=lambda x: x[1], reverse=True)

        # Queries without relevance judgements get an empty relevant set.
        relevant_files = set(relevance_dict.get(query, []))
        precision_at_k = calculate_precision_at_k(relevant_files, ranked_results, K)

        result_file.write(f"Cosine similarities for Query: '{query}'\n")
        for filename, score in ranked_results:
            result_file.write(f"Cosine similarity between Query and '{filename}': {score:.4f}\n")
        result_file.write(f"Precision@{K} for Query: '{query}': {precision_at_k:.4f}\n\n")

print("Cosine similarities and Precision at K written to result.txt")


Cosine similarities and Precision at K written to result.txt
