# 1)

In [26]:
import os
import math
from collections import defaultdict

# Step 1: Tokenization and TF calculation
def tokenize(text):
    """Lowercase ``text`` and split it on runs of whitespace.

    Returns the resulting tokens as a list of strings; an empty or
    whitespace-only input yields an empty list.
    """
    lowered = text.lower()
    return lowered.split()

def compute_tf(documents):
    """Compute log-scaled term frequencies for every document.

    ``documents`` maps a document key to a dict that holds the text under
    ``"content"``. The result is a ``defaultdict(dict)`` mapping each key to
    ``{token: 1 + ln(count)}``, where ``count`` is the number of times the
    token occurs in that document. A document that yields no tokens gets no
    entry in the result.
    """
    weights_by_doc = defaultdict(dict)
    for doc_key, doc in documents.items():
        occurrences = {}
        for term in tokenize(doc["content"]):
            occurrences[term] = occurrences.get(term, 0) + 1

        # Write through the defaultdict one token at a time so that a
        # document without tokens never creates an (empty) entry.
        for term in occurrences:
            weights_by_doc[doc_key][term] = 1 + math.log(occurrences[term])
    return weights_by_doc

# Step 2: Compute IDF
def compute_idf(documents):
    """Compute the inverse document frequency of every token in the corpus.

    ``documents`` maps a document key to a dict that holds the text under
    ``"content"``. The result is a ``defaultdict(float)`` mapping each token
    to ``ln(N / df)``, where ``N`` is ``len(documents)`` and ``df`` is the
    number of documents whose tokenized content contains the token.
    """
    doc_total = len(documents)

    # Count each token at most once per document by de-duplicating first.
    containing_docs = defaultdict(int)
    for doc in documents.values():
        for term in set(tokenize(doc["content"])):
            containing_docs[term] += 1

    weights = defaultdict(float)
    weights.update(
        (term, math.log(doc_total / df)) for term, df in containing_docs.items()
    )
    return weights

# Step 3: Compute TF-IDF
def compute_tf_idf(tf, idf):
    """Combine per-document TF weights with corpus-wide IDF weights.

    ``tf`` maps a document key to ``{token: tf_weight}`` and ``idf`` maps a
    token to its IDF weight. The result is a ``defaultdict(dict)`` mapping
    each document key to ``{token: tf_weight * idf_weight}``; a token that is
    absent from ``idf`` is multiplied by 0. A document whose TF mapping is
    empty gets no entry in the result.
    """
    weighted_docs = defaultdict(dict)
    for doc_key in tf:
        weighted = {
            term: weight * idf.get(term, 0)
            for term, weight in tf[doc_key].items()
        }
        if weighted:  # a document without terms gets no entry
            weighted_docs[doc_key] = weighted
    return weighted_docs

# Step 4: Score documents based on query
def score_documents(query, tf, tf_idf):
    """Score every document against ``query`` under both weighting schemes.

    The query is tokenized with ``tokenize``; each document's score is the
    sum, over the query tokens (repeated tokens count repeatedly), of that
    token's weight in the document, with 0 for tokens the document lacks.

    Returns ``(tf_scores, tf_idf_scores)``, two ``defaultdict(float)`` maps
    keyed like ``tf`` and ``tf_idf`` respectively. When the query has no
    tokens, both maps are empty.
    """
    terms = tokenize(query)

    def _sum_query_weights(weights_by_doc):
        # Add up the weights of the query terms for each document.
        totals = defaultdict(float)
        for doc_key, weights in weights_by_doc.items():
            for term in terms:
                totals[doc_key] += weights.get(term, 0)
        return totals

    return _sum_query_weights(tf), _sum_query_weights(tf_idf)

# Step 5: Categorize results
def categorize_results(scores, documents):
    """Group scored documents by the category recorded in ``documents``.

    ``scores`` maps a document key to its score; ``documents`` maps the same
    keys to dicts carrying a ``"category"`` entry. Returns a
    ``defaultdict(list)`` mapping each category to a list of
    ``(document_key, score)`` tuples in the iteration order of ``scores``.
    Raises ``KeyError`` if a scored key is missing from ``documents``.
    """
    grouped = defaultdict(list)
    for doc_key in scores:
        entry = (doc_key, scores[doc_key])
        grouped[documents[doc_key]["category"]].append(entry)
    return grouped

# Determine query category based on highest score
def determine_query_category(categorized_results):
    """Pick the category whose documents have the largest total score.

    ``categorized_results`` maps a category to a list of
    ``(document_key, score)`` tuples. Categories with an empty list are
    ignored. Returns the winning category, the first one encountered in case
    of a tie, or ``None`` when no category has any scored document.
    """
    totals = defaultdict(float)
    for category in categorized_results:
        for _doc_key, value in categorized_results[category]:
            totals[category] += value

    if not totals:
        return None
    return max(totals, key=totals.get)

# Load documents from folders
def load_documents(folder_path):
    """Read every article file found one level below ``folder_path``.

    Each sub-directory of ``folder_path`` is treated as a category and each
    regular file directly inside it as one UTF-8 encoded article.

    Args:
        folder_path: Directory whose sub-directories are the categories.

    Returns:
        A dict mapping each article's path (``folder_path``/category/file)
        to ``{"category": <sub-directory name>, "content": <file text>}``.
        Entries are inserted in sorted category order, then sorted file
        name order.

    Raises:
        OSError: If ``folder_path`` or one of the files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    documents = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # insertion order of the returned dict reproducible across runs.
    for category in sorted(os.listdir(folder_path)):
        category_path = os.path.join(folder_path, category)
        if not os.path.isdir(category_path):
            continue
        for filename in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, filename)
            # A nested directory (e.g. ``.ipynb_checkpoints``) is not an
            # article, and calling open() on it would raise.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as file:
                documents[file_path] = {"category": category, "content": file.read()}
    return documents

# Main function
def main(queries, folder_path):
    """Index the corpus under ``folder_path`` and evaluate each query.

    The documents are loaded once, their TF, IDF and TF-IDF weights are
    computed, and every query in ``queries`` is then scored against them.

    Returns a dict keyed by query string. Each value is a dict with:
      * ``"tf"`` / ``"tf_idf"``: the scored documents grouped by category
        under the respective weighting;
      * ``"tf_category"`` / ``"tf_idf_category"``: the category chosen by
        ``determine_query_category`` for that grouping.
    """
    documents = load_documents(folder_path)

    tf = compute_tf(documents)
    idf = compute_idf(documents)
    tf_idf = compute_tf_idf(tf, idf)

    results = {}
    for query in queries:
        tf_scores, tf_idf_scores = score_documents(query, tf, tf_idf)

        # Group first, then derive the winning category from each grouping.
        report = {
            "tf": categorize_results(tf_scores, documents),
            "tf_idf": categorize_results(tf_idf_scores, documents),
        }
        report["tf_category"] = determine_query_category(report["tf"])
        report["tf_idf_category"] = determine_query_category(report["tf_idf"])

        results[query] = report

    return results

# Example usage
folder_path = "Data"
queries = [
    "اهمیت عمر انسان برای پزشک",
    "سرمربی تیم در لیگ قهرمانان",
    "دانش تحقیقات در سلولهای بنیادی",
    "دین و علم در کالم شهردار تهران"
]

results = main(queries, folder_path)

# Print, per query, the chosen categories followed by the per-category
# document scores under each weighting scheme.
for query, result in results.items():
    print(f"Query: {query}")
    print(f"Determined Category (TF): {result['tf_category']}")
    print(f"Determined Category (TF-IDF): {result['tf_idf_category']}")

    # (heading, result key) for each weighting scheme, in display order.
    for heading, key in (("TF Results:", "tf"), ("TF-IDF Results:", "tf_idf")):
        print(heading)
        for category, res in result[key].items():
            print(f"  Category: {category}")
            for filename, score in res:
                print(f"    File: {filename}, Score: {score}")
    print("-" * 80)

Query: اهمیت عمر انسان برای پزشک
Determined Category (TF): سیاسی
Determined Category (TF-IDF): سیاسی
TF Results:
  Category: اجتماعی
    File: Data\اجتماعی\13800711-txt-0073631_utf.txt, Score: 0.0
    File: Data\اجتماعی\13810227-txt-0127492_utf.txt, Score: 2.0
    File: Data\اجتماعی\13810320-txt-0132830_utf.txt, Score: 0.0
    File: Data\اجتماعی\13810821-txt-0172902_utf.txt, Score: 0.0
    File: Data\اجتماعی\13830627-txt-0431835_utf.txt, Score: 1.0
    File: Data\اجتماعی\13850502-txt-0751465_utf.txt, Score: 0.0
    File: Data\اجتماعی\13850506-txt-0754145_utf.txt, Score: 0.0
    File: Data\اجتماعی\13850723-txt-0802407_utf.txt, Score: 0.0
    File: Data\اجتماعی\13860630-txt-1002033_utf.txt, Score: 0.0
    File: Data\اجتماعی\13870730-txt-1219770_utf.txt, Score: 0.0
  Category: اديان
    File: Data\اديان\13860108-txt-0896151_utf.txt, Score: 0.0
    File: Data\اديان\13860117-txt-0884507_utf.txt, Score: 3.1972245773362196
    File: Data\اديان\13860431-txt-0963964_utf.txt, Score: 0.0
    File

---
# 2) Effect of Removing Stop Words from the Query on Outputs:

1. Improved Relevance:

Removing stop words shifts the focus of the query to its more meaningful, content-rich terms.

2. Reduced Noise:

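Stop words occur in almost every document, so they add noise to the text data and lead to less accurate TF and TF-IDF scores: under plain TF they contribute to nearly every document's score, while under TF-IDF their near-zero IDF already damps them. Once they are removed, the model can better differentiate between documents based on the remaining, more meaningful terms.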

3. Smaller Vocabulary Size:

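The number of terms to process is reduced, which simplifies the computation and can improve the efficiency of the system. Note that in this experiment only the query is filtered, so the saving is in the number of query terms looked up per document; the indexed vocabulary, and with it the number of terms to keep track of in the TF and IDF calculations, shrinks only if stop words are also removed from the documents.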

4. Potential Loss of Context:

While stop words are often semantically weak, they sometimes provide necessary context. Removing them might lead to a loss of important contextual information, potentially changing the meaning of the query.

In [23]:
# Example usage
folder_path = "Data"
original_query = "دانش تحقیقات در سلولهای بنیادی"
stop_removed_query = "دانش تحقیقات سلولهای بنیادی"
queries = [original_query, stop_removed_query]

results = main(queries, folder_path)

# Print the same report for the original query and for its stop-word-free
# variant so the two can be compared side by side.
for query, result in results.items():
    print(f"Query: {query}")
    print(f"Determined Category (TF): {result['tf_category']}")
    print(f"Determined Category (TF-IDF): {result['tf_idf_category']}")

    # (heading, result key) for each weighting scheme, in display order.
    for heading, key in (("TF Results:", "tf"), ("TF-IDF Results:", "tf_idf")):
        print(heading)
        for category, res in result[key].items():
            print(f"  Category: {category}")
            for filename, score in res:
                print(f"    File: {filename}, Score: {score}")
    print("-" * 80)

Query: دانش تحقیقات در سلولهای بنیادی
Determined Category (TF): سیاسی
Determined Category (TF-IDF): مسائل راهبردي ايران
TF Results:
  Category: اجتماعی
    File: Data\اجتماعی\13800711-txt-0073631_utf.txt, Score: 3.833213344056216
    File: Data\اجتماعی\13810227-txt-0127492_utf.txt, Score: 7.2574953720277815
    File: Data\اجتماعی\13810320-txt-0132830_utf.txt, Score: 5.653960350157523
    File: Data\اجتماعی\13810821-txt-0172902_utf.txt, Score: 7.41610040220442
    File: Data\اجتماعی\13830627-txt-0431835_utf.txt, Score: 6.3981627015177525
    File: Data\اجتماعی\13850502-txt-0751465_utf.txt, Score: 6.093750200806762
    File: Data\اجتماعی\13850506-txt-0754145_utf.txt, Score: 6.3471075307174685
    File: Data\اجتماعی\13850723-txt-0802407_utf.txt, Score: 5.59511985013459
    File: Data\اجتماعی\13860630-txt-1002033_utf.txt, Score: 5.9344739331306915
    File: Data\اجتماعی\13870730-txt-1219770_utf.txt, Score: 5.919980925828125
  Category: اديان
    File: Data\اديان\13860108-txt-0896151_utf.tx

---
---