In [48]:
import random
import re
import os
import math
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import defaultdict

In [49]:
# Fetch the NLTK data packages used by the cells below. When a package is
# already present locally, nltk.download just reports it as up-to-date.
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords
# Tokenizer models; presumably what word_tokenize loads.
# NOTE(review): newer NLTK releases may also require 'punkt_tab' — confirm
# against the installed version.
nltk.download('punkt')
nltk.download('wordnet')  # WordNet data, presumably what WordNetLemmatizer reads — confirm

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PBS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PBS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PBS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [50]:
# English stop words, held in a frozenset so that the per-token
# `word not in STOPWORDS` test in clean_text is a hash lookup rather than a
# linear scan over a list.
STOPWORDS = frozenset(stopwords.words('english'))
# A single lemmatizer instance shared by every clean_text call.
LEMMATIZER = WordNetLemmatizer()

In [51]:
def clean_text(text):
    """Normalize raw text into a list of lemmatized, non-stop-word tokens.

    Steps, in order: lowercase the text, delete every character that is not
    an ASCII letter, digit or whitespace, delete digit runs, tokenize with
    ``word_tokenize``, discard tokens found in ``STOPWORDS`` and lemmatize
    the remainder with ``LEMMATIZER``.

    Args:
        text: The raw string to process.

    Returns:
        list[str]: The surviving tokens, lemmatized, in original order.
    """
    lowered = text.lower()
    # Punctuation is deleted (not replaced by a space), so "well-known"
    # collapses into the single token "wellknown".
    without_symbols = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    letters_only = re.sub(r"\d+", "", without_symbols)

    cleaned_tokens = []
    for token in word_tokenize(letters_only):
        # The stop-word test is applied to the raw token, before lemmatizing.
        if token in STOPWORDS:
            continue
        cleaned_tokens.append(LEMMATIZER.lemmatize(token))
    return cleaned_tokens

In [52]:
def load_text_files(documents_path):
    """Read and tokenize every ``.txt`` file directly inside a directory.

    Args:
        documents_path: Directory to scan. Sub-directories are not descended
            into.

    Returns:
        dict: Maps each file name (not the full path) to the token list that
        ``clean_text`` produces from the file's UTF-8 contents. Keys are
        inserted in sorted file-name order.
    """
    documents = {}
    # os.listdir yields entries in arbitrary, platform-dependent order.
    # Sorting keeps the dict order -- and with it tie-breaking in rankings and
    # the order of anything written out later -- reproducible across machines.
    for filename in sorted(os.listdir(documents_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(documents_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            documents[filename] = clean_text(file.read())
    return documents

In [53]:
def load_query(query_path):
    """Read queries from a text file, one query per line.

    Args:
        query_path: Path to a UTF-8 encoded text file.

    Returns:
        list[str]: Every non-blank line, stripped of surrounding whitespace,
        in file order.
    """
    # Explicit UTF-8 matches load_text_files; relying on the platform default
    # encoding can mis-decode non-ASCII queries on some systems.
    with open(query_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines (e.g. a trailing newline at end of file) are dropped so
        # they do not turn into empty queries.
        return [line for line in stripped_lines if line]

In [54]:
def compute_statistics(docs):
    """Build per-document term counts and corpus-wide document frequencies.

    Each document id is printed as it is processed.

    Args:
        docs: Mapping of document id -> list of tokens.

    Returns:
        tuple: ``(term_freq, term_doc_freq, doc_count)`` where
            * ``term_freq[doc_id][term]`` is how often ``term`` occurs in that
              document (a document with no tokens gets no entry),
            * ``term_doc_freq[term]`` is the number of documents containing
              ``term`` at least once,
            * ``doc_count`` is ``len(docs)``.
    """
    docs_containing = defaultdict(int)
    per_doc_counts = defaultdict(lambda: defaultdict(int))

    for name, tokens in docs.items():
        print(name)
        distinct_tokens = set(tokens)
        # Document frequency: each distinct token counts once per document.
        for token in distinct_tokens:
            docs_containing[token] += 1
        # Term frequency: every occurrence counts. The per-document entry is
        # only created on the first token, so empty documents stay absent.
        for token in tokens:
            per_doc_counts[name][token] += 1

    return per_doc_counts, docs_containing, len(docs)

In [55]:
def compute_relevance_prob(query, term_freq, term_doc_freq, doc_count):
    """Score every document against a query as a product of per-term ratios.

    For each query term the ratio is::

        ((tf + 1) / (doc_length + vocab_size))
        / ((df + 1) / (doc_count - df + vocab_size))

    where ``tf`` is the term's count in the document, ``df`` the number of
    documents containing it, ``doc_length`` the total token count of the
    document and ``vocab_size`` the number of distinct terms in the corpus.
    The ``+ 1`` terms keep unseen query terms from zeroing the score.

    Args:
        query: Iterable of (already cleaned) query terms.
        term_freq: Mapping doc_id -> mapping term -> count in that document.
        term_doc_freq: Mapping term -> number of documents containing it.
        doc_count: Total number of documents in the corpus.

    Returns:
        dict: Maps each doc_id in ``term_freq`` to its float score. An empty
        query gives every document a score of ``1.0``.
    """
    # Both quantities are independent of the query term, so compute them once
    # (per corpus / per document) instead of once per term.
    vocab_size = len(term_doc_freq)
    scores = {}
    for doc_id, doc_terms in term_freq.items():
        doc_length = sum(doc_terms.values())
        score = 1.0
        for term in query:
            tf = doc_terms.get(term, 0)
            df = term_doc_freq.get(term, 0)
            p_term_given_relevant = (tf + 1) / (doc_length + vocab_size)
            p_term_given_not_relevant = (df + 1) / (doc_count - df + vocab_size)
            score *= p_term_given_relevant / p_term_given_not_relevant
        scores[doc_id] = score
    return scores

In [56]:
def retrieve_documents(path, query_path):
    """Print, for every query, all documents ranked by relevance score.

    Loads and tokenizes the documents under ``path``, reads the queries from
    ``query_path``, computes corpus statistics once, and then prints each
    query followed by one line per document in descending score order.

    Args:
        path: Directory holding the ``.txt`` documents.
        query_path: Text file with one query per line.

    Returns:
        None. Results are written to standard output only.
    """
    docs = load_text_files(path)
    queries = load_query(query_path)
    term_freq, term_doc_freq, doc_count = compute_statistics(docs)

    def by_score(item):
        # item is a (filename, score) pair.
        return item[1]

    for query in queries:
        scores = compute_relevance_prob(
            clean_text(query), term_freq, term_doc_freq, doc_count
        )
        # Stable sort: documents with equal scores keep their original order.
        ranking = list(scores.items())
        ranking.sort(key=by_score, reverse=True)

        print(f"Query: {query}")
        for filename, score in ranking:
            print(f"Document: {filename}, Score: {score:.4f}")
        print()

In [57]:
def main(seed=None):
    """Rank the documents for every query, then write random relevance labels.

    First prints the ranking of all documents under ``./documents/`` for each
    query in ``queries.txt``. Then assigns every (query, document) pair a
    random integer relevance label and writes the labels to
    ``ajit_score.txt``.

    Args:
        seed: Optional seed for the random labels. The default ``None`` leaves
            the generator unseeded, so labels differ between runs; pass an
            int to make the written labels reproducible.
    """
    folder_path = './documents/'
    query_path = 'queries.txt'
    retrieve_documents(folder_path, query_path)

    # Private generator, so seeding it never disturbs the global `random` state.
    rng = random.Random(seed)

    def assign_random_relevance(queries, documents, relevance_scale=(0, 1)):
        """Return {query: {doc: label}} with labels drawn uniformly from the
        inclusive integer range given by ``relevance_scale``."""
        low, high = relevance_scale[0], relevance_scale[1]
        relevance_scores = {}
        for query in queries:
            relevance_scores[query] = {
                doc: rng.randint(low, high) for doc in documents
            }
        return relevance_scores

    def save_relevance_scores_to_file(relevance_scores, output_file):
        """Write one block per query: the query line, one ``doc:score`` line
        per document, then a blank separator line."""
        # UTF-8 is explicit because document names and queries may contain
        # non-ASCII characters that the platform default encoding cannot write.
        with open(output_file, 'w', encoding='utf-8') as f:
            for query, doc_scores in relevance_scores.items():
                f.write(f"{query}\n")
                for doc, score in doc_scores.items():
                    f.write(f"{doc}:{score}\n")
                f.write("\n")

    # retrieve_documents returns nothing, so the documents and queries are
    # loaded again here to obtain the names that need labels.
    documents = load_text_files(folder_path)
    queries = load_query(query_path)

    random_relevance_scores = assign_random_relevance(queries, documents.keys())

    output_file = 'ajit_score.txt'
    save_relevance_scores_to_file(random_relevance_scores, output_file)

    print(f"Relevance scores saved to {output_file}")

# Entry point: run the pipeline when this code executes as the top-level
# program, and skip it when the file is imported as a module.
if __name__ == "__main__":
    main()

10 undecided voters explain why they haven’t picked a side in this election.txt
2024 US election Kamala Harris_s transformation.txt
Arm the public with facts Microsoft billionaire fights US election disinformation.txt
Election 2024 Latest Trump and Harris campaign for undecided voters with just 6 weeks left.txt
Fears mount that election deniers could disrupt vote count in US swing.txt
Harris Had Stronger Debate, Polls Find, but the Race Remains Deadlocked.txt
Harris to condemn Trump in Georgia after news of abortion-related deaths.txt
Mounting North Korean threats await next US president.txt
Oprah, Swift and Clooney do celebrity endorsements matter in the US election.txt
Poll shows US voters think Harris, Trump on equal footing on economy, crime, Gaza war.txt
Pope Francis tells US Catholics to choose ‘lesser evil’ in coming election.txt
Presidential campaigns hit battleground states in very close races.txt
So it begins 2024 presidential election underway as early voting opens in states