In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import re
import random
import numpy as np

# Function to extract page ID from the filename
def get_page_id(file_name):
    match = re.search(r'page(\d+)', file_name)
    if match:
        return int(match.group(1))
    else:
        return None

# Function to extract page name from the filename
def get_page_name(file_name):
    # Extract page name from the filename by removing the file extension
    return os.path.splitext(file_name)[0]

# Function to preprocess the document content
def preprocess_document(document_content):
    # Tokenize the content by splitting on spaces (you can use more advanced tokenization)
    words = document_content.split()

    # Remove stop words, special characters, and sentence delimiters
    stop_words = set(["the", "and", "is", "in", "it", "an", "to", "of", "as", "by"])
    words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

    # Calculate total weight (word count) and total count word
    total_weight = len(words)
    total_count_word = len(set(words))

    return total_weight, total_count_word, words

# Function to perform document indexing
def document_indexing(dataset_path):
    pages_information = {}
    words_information = {}

    for root, _, files in os.walk(dataset_path):
        for file_name in files:
            if file_name.endswith(".txt"):
                with open(os.path.join(root, file_name), 'r', encoding='utf-8') as file:
                    document_content = file.read()
                    total_weight, total_count_word, words = preprocess_document(document_content)

                    page_id = get_page_id(file_name)
                    page_name = get_page_name(file_name)

                    # Store page information
                    pages_information[page_id] = {
                        "page_name": page_name,
                        "total_weight": total_weight,
                        "total_count_word": total_count_word,
                    }

                    # Update words information
                    for word in words:
                        if word not in words_information:
                            words_information[word] = {"pages_list": [page_id]}
                        else:
                            words_information[word]["pages_list"].append(page_id)

    return pages_information, words_information

# Function to perform query search
def query_search(user_query, words_information):
    query_words = user_query.lower().split()
    id_list = []

    for word in query_words:
        if word in words_information:
            id_list.extend(words_information[word]["pages_list"])

    return list(set(id_list))  # Remove duplicates

# Function to perform the modified GA procedure
def modified_genetic_algorithm(population_size, num_iterations):
    # Replace this with your actual genetic algorithm logic
    # Here, we'll just generate random fitness values for demonstration
    fitness_values = [random.random() for _ in range(population_size)]

    # Return the best fitness value after the specified number of iterations
    best_fitness = max(fitness_values)
    return best_fitness

# Example usage
if __name__ == "__main__":
    dataset_path = "/content/drive/MyDrive/Major_Project/Project_code/webkb-data/webkb"

    # Algorithm 1: Document Indexing
    pages_information, words_information = document_indexing(dataset_path)

    # Algorithm 2: Query Search
    user_query = "Required"
    relevant_page_ids = query_search(user_query, words_information)

    # Algorithm 3: Modified Genetic Algorithm
    population_size = 50
    num_iterations = 15
    best_fitness = modified_genetic_algorithm(population_size, num_iterations)

    # Print results
    print("Pages Information:")
    for page_id, page_info in pages_information.items():
        print(f"Page ID: {page_id}, Page Name: {page_info['page_name']}, Total Weight: {page_info['total_weight']}, Total Count Word: {page_info['total_count_word']}")

    print("\nWords Information:")
    for word, word_info in words_information.items():
        print(f"Word: {word}, Pages List: {word_info['pages_list']}")

    print("\nRelevant Page IDs for Query:", relevant_page_ids)
    print("Best Fitness Value:", best_fitness)


Pages Information:
Page ID: None, Page Name: http_^^www.cs.clemson.edu^~madhu^usa, Total Weight: 1085, Total Count Word: 649

Words Information:
Word: 14, Pages List: [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
Word: jan, Pages List: [None, None, None, None, None, None, None]
Word: 1997, Pages List: [None, None, None, None, None, None, None]
Word: gmt, Pages List: [None, None, None, None, None, None, None, None, None, None, None, None, None, None]
Word: 15, Pages List: [None, None, None, None, None, None, None, None, None, None, None, None]
Word: feb, Pages List: [None]
Word: 1996, Pages List: [None, None, None]
Word: 3082, Pages List: [None]
Word: problem, Pages List: [None, None, None, None, None]
Word: 1, Pages List: [None, None, None, None, None, None, None, None, None, None, None, None]
Word: part, Pages List: [None, None, None, None, None, None, None, None]
Word: a, Pages List: [None, None, None, None, None, None, None, None, N

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Ground truth for relevant documents, you can replace this with your actual ground truth
# Format: {query_id: [list of relevant document IDs]}
ground_truth = {
    'query_1': [1, 2, 3],
    'query_2': [4, 5],


# Retrieved documents for each query
# Format: {query_id: [list of retrieved document IDs]}
retrieved_documents = {
    'query_1': [1, 3, 4],
    'query_2': [4, 5, 6],

}

# Initialize empty lists for ground truth labels and retrieved document labels
ground_truth_labels = []
retrieved_doc_labels = []

# Align the data
for query_id, relevant_docs in ground_truth.items():
    retrieved_docs = retrieved_documents.get(query_id, [])
    for doc_id in retrieved_docs:
        if doc_id in relevant_docs:
            ground_truth_labels.append(1)  # Document is relevant
        else:
            ground_truth_labels.append(0)  # Document is not relevant
        retrieved_doc_labels.append(1 if doc_id in retrieved_docs else 0)

# Calculate evaluation metrics
accuracy = accuracy_score(ground_truth_labels, retrieved_doc_labels)
recall = recall_score(ground_truth_labels, retrieved_doc_labels)
precision = precision_score(ground_truth_labels, retrieved_doc_labels)
f1 = f1_score(ground_truth_labels, retrieved_doc_labels)

print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)


Accuracy: 0.6666666666666666
Recall: 1.0
Precision: 0.6666666666666666
F1 Score: 0.8


In [None]:
# Function to preprocess the document content
def preprocess_document(document_content):
    # Tokenize the content by splitting on spaces (you can use more advanced tokenization)
    words = document_content.split()

    # Remove stop words, special characters, and sentence delimiters
    stop_words = set(["the", "and", "is", "in", "it", "an", "to", "of", "as", "by"])
    words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

    # Calculate total weight (word count) and total count word
    total_weight = len(words)
    total_count_word = len(set(words))

    return total_weight, total_count_word, words

# Function to perform document indexing
def document_indexing(dataset_path):
    pages_information = {}
    words_information = {}

    for root, _, files in os.walk(dataset_path):
        for file_name in files:
            if file_name.endswith(".txt"):
                with open(os.path.join(root, file_name), 'r', encoding='utf-8') as file:
                    document_content = file.read()
                    total_weight, total_count_word, words = preprocess_document(document_content)

                    page_id = get_page_id(file_name)
                    page_name = get_page_name(file_name)

                    # Store page information
                    pages_information[page_id] = {
                        "page_name": page_name,
                        "total_weight": total_weight,
                        "total_count_word": total_count_word,
                    }

                    # Update words information
                    for word in words:
                        if word not in words_information:
                            words_information[word] = {"pages_list": [page_id]}
                        else:
                            words_information[word]["pages_list"].append(page_id)

    # Replace None with the actual count of occurrences
    for word, info in words_information.items():
        info["pages_list"] = len(info["pages_list"])

    return pages_information, words_information

# Function to print words information in table format
def print_words_information(words_information):
    print("\nWords Information:")
    print("{:<20} {}".format("Word", "Document IDs"))
    for word, info in words_information.items():
        print("{:<20} {}".format(word, info["pages_list"]))

# ... (rest of the code remains the same)

# Print words information
print_words_information(words_information)

# Function to perform query search
def query_search(user_query, words_information):
    query_words = user_query.lower().split()
    id_list = []

    for word in query_words:
        if word in words_information:
            id_list.extend(words_information[word]["pages_list"])

    return list(set(id_list))  # Remove duplicates

# Function to perform the modified GA procedure
def modified_genetic_algorithm(population_size, num_iterations):
    # Replace this with your actual genetic algorithm logic
    # Here, we'll just generate random fitness values for demonstration
    fitness_values = [random.random() for _ in range(population_size)]

    # Return the best fitness value after the specified number of iterations
    best_fitness = max(fitness_values)
    return best_fitness

# Example usage
if __name__ == "__main__":
    dataset_path = "/content/drive/MyDrive/Major_Project/Project_code/webkb-data/webkb"

    # Algorithm 1: Document Indexing
    pages_information, words_information = document_indexing(dataset_path)

    # Algorithm 2: Query Search
    user_query = "Required"
    relevant_page_ids = query_search(user_query, words_information)

    # Algorithm 3: Modified Genetic Algorithm
    population_size = 50
    num_iterations = 15
    best_fitness = modified_genetic_algorithm(population_size, num_iterations)

    # Print results
    print("Pages Information:")
    for page_id, page_info in pages_information.items():
        print(f"Page ID: {page_id}, Page Name: {page_info['page_name']}, Total Weight: {page_info['total_weight']}, Total Count Word: {page_info['total_count_word']}")

    print("\nWords Information:")
    for word, word_info in words_information.items():
        print(f"Word: {word}, Pages List: {word_info['pages_list']}")

    print("\nRelevant Page IDs for Query:", relevant_page_ids)
    print("Best Fitness Value:", best_fitness)




Words Information:
Word                 Document IDs
14                   [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
jan                  [None, None, None, None, None, None, None]
1997                 [None, None, None, None, None, None, None]
gmt                  [None, None, None, None, None, None, None, None, None, None, None, None, None, None]
15                   [None, None, None, None, None, None, None, None, None, None, None, None]
feb                  [None]
1996                 [None, None, None]
3082                 [None]
problem              [None, None, None, None, None]
1                    [None, None, None, None, None, None, None, None, None, None, None, None]
part                 [None, None, None, None, None, None, None, None]
a                    [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None

In [None]:
import os
import re
import random
import numpy as np

# Function to extract page ID from the filename
def get_page_id(file_name):
    match = re.search(r'page(\d+)', file_name)
    if match:
        return int(match.group(1))
    else:
        return None

# Function to extract page name from the filename
def get_page_name(file_name):
    # Extract page name from the filename by removing the file extension
    return os.path.splitext(file_name)[0]

# Function to preprocess the document content
def preprocess_document(document_content):
    # Tokenize the content by splitting on spaces (you can use more advanced tokenization)
    words = document_content.split()

    # Remove stop words, special characters, and sentence delimiters
    stop_words = set(["the", "and", "is", "in", "it", "an", "to", "of", "as", "by"])
    words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

    # Calculate total weight (word count) and total count word
    total_weight = len(words)
    total_count_word = len(set(words))

    return total_weight, total_count_word, words

# Function to perform document indexing
def document_indexing(dataset_path):
    pages_information = {}
    words_information = {}

    for root, _, files in os.walk(dataset_path):
        for file_name in files:
            if file_name.endswith(".txt"):
                with open(os.path.join(root, file_name), 'r', encoding='utf-8') as file:
                    document_content = file.read()
                    total_weight, total_count_word, words = preprocess_document(document_content)

                    page_id = get_page_id(file_name)
                    page_name = get_page_name(file_name)

                    # Store page information
                    pages_information[page_id] = {
                        "page_name": page_name,
                        "total_weight": total_weight,
                        "total_count_word": total_count_word,
                    }

                    # Update words information
                    for word in words:
                        if word not in words_information:
                            words_information[word] = {"pages_list": [page_id]}
                        else:
                            words_information[word]["pages_list"].append(page_id)

    return pages_information, words_information

# Function to perform query search
def query_search(user_query, words_information):
    query_words = user_query.lower().split()
    id_list = []

    for word in query_words:
        if word in words_information:
            id_list.extend(words_information[word]["pages_list"])
        else:
            print(f"Word '{word}' not found in indexed words.")

    return list(set(id_list))  # Remove duplicates


# Function to perform the modified GA procedure
def modified_genetic_algorithm(population_size, num_iterations):
    # Replace this with your actual genetic algorithm logic
    # Here, we'll just generate random fitness values for demonstration
    fitness_values = [random.random() for _ in range(population_size)]

    # Return the best fitness value after the specified number of iterations
    best_fitness = max(fitness_values)
    return best_fitness

# Function to print words information in a table format
def print_words_information(words_information):
    print("\nWords Information:")
    print("{:<20} {}".format("Word", "Page IDs"))

    for word, info in words_information.items():
        page_ids = ', '.join(map(str, info["pages_list"]))
        print("{:<20} {}".format(word, page_ids))

if __name__ == "__main__":
    dataset_path = "/content/drive/MyDrive/Major_Project/Project_code/webkb-data/webkb"

    # Algorithm 1: Document Indexing
    pages_information, words_information = document_indexing(dataset_path)

    # Algorithm 2: Query Search
    user_query = "ANALYSIS CRYPTOGRAPHIC PROTOCOLS"
    relevant_page_ids = query_search(user_query, words_information)

    # Algorithm 3: Modified Genetic Algorithm
    population_size = 50
    num_iterations = 20
    best_fitness = modified_genetic_algorithm(population_size, num_iterations)

    # Print results
    print("Pages Information:")
    for page_id, page_info in pages_information.items():
        print(f"Page ID: {page_id}, Page Name: {page_info['page_name']}, Total Weight: {page_info['total_weight']}, Total Count Word: {page_info['total_count_word']}")

    print_words_information(words_information)

    print("\nRelevant Page IDs for Query:", relevant_page_ids)
    print("Best Fitness Value:", best_fitness)


Word 'analysis' not found in indexed words.
Word 'cryptographic' not found in indexed words.
Word 'protocols' not found in indexed words.
Pages Information:
Page ID: None, Page Name: http_^^www.cs.clemson.edu^~madhu^usa, Total Weight: 1085, Total Count Word: 649

Words Information:
Word                 Page IDs
14                   None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
jan                  None, None, None, None, None, None, None
1997                 None, None, None, None, None, None, None
gmt                  None, None, None, None, None, None, None, None, None, None, None, None, None, None
15                   None, None, None, None, None, None, None, None, None, None, None, None
feb                  None
1996                 None, None, None
3082                 None
problem              None, None, None, None, None
1                    None, None, None, None, None, None, None, None, None, None, None, None
part               

In [None]:
import random

# Data Structures
pages_information = {}  # Dictionary to store page information
words_information = {}  # Dictionary to store word information
queries = []  # List to store user queries
relevant_documents = {}  # Dictionary to store relevant documents for queries

# Algorithm 1: Advanced Document Indexing Method
def process_document(document):
    # Extract page ID from the document's source code
    page_id_match = re.search(r'<id>(.*?)</id>', document)
    if page_id_match:
        page_id = int(page_id_match.group(1))
    else:
        page_id = 0  # Handle the case when no <id> tag is found

    # Extract page name from the document's source code
    page_name_match = re.search(r'<p-name>(.*?)</p-name>', document)
    if page_name_match:
        page_name = page_name_match.group(1)
    else:
        page_name = "Unknown Page"  # Handle the case when no <p-name> tag is found

    # Tokenize the content of the document
    content = re.sub(r'<.*?>', '', document)  # Remove HTML tags
    valid_tokens = re.findall(r'\b\w+\b', content)  # Tokenize based on word boundaries

    return page_id, page_name, valid_tokens

def advanced_document_indexing(dataset_path):
    # Initialize the data structures
    pages_information = {}
    words_information = {}

    # Replace this part with your logic to read and process documents
    # For demonstration, let's assume you have a list of documents
    documents = ["Document 1 content...", "Document 2 content...", "Document 3 content..."]

    for document in documents:
        # Process the document to extract page ID, page name, and valid tokens
        page_id, page_name, valid_tokens = process_document(document)

        # Update pages_information
        pages_information[page_id] = {
            'id': page_id,
            'p-name': page_name,
            'total-weight': 0,  # You can calculate the total weight here
            'total-count-word': len(valid_tokens)  # Total count of words
        }

        # Update words_information
        for token in valid_tokens:
            if token not in words_information:
                words_information[token] = {'pages-list': []}
            words_information[token]['pages-list'].append(page_id)

    # Now, you have updated pages_information and words_information
    return pages_information, words_information

# Usage example
dataset_path = "/content/drive/MyDrive/Major_Project/Project_code/webkb-data/webkb"
pages_information, words_information = advanced_document_indexing(dataset_path)





In [None]:
# Algorithm 2: Query Search Algorithm (QSA)
def tokenize_query(query):
    # Replace with logic to tokenize the user query
    return query.split()

def retrieve_documents(query_tokens):
    # Replace with logic to retrieve documents based on query tokens
    return [random.randint(1, 1000) for _ in range(10)]

def query_search(query):
    query_tokens = tokenize_query(query)
    id_list = retrieve_documents(query_tokens)
    return id_list



In [None]:
# Algorithm 3: Genetic Algorithm (GA)
def initialize_population(population_size):
    # Replace with logic to initialize the population
    return [random.randint(0, 1) for _ in range(population_size)]

def evaluate_fitness(population, relevant_documents):
    # Replace with logic to evaluate the fitness of each chromosome
    return [random.random() for _ in range(len(population))]

def select_population(population, fitness_values):
    # Replace with logic to select individuals for the next generation
    return population[:len(population)//2]

def create_offspring(selected_population):
    # Replace with logic to create offspring through crossover and mutation
    return selected_population

def select_best_solution(population):
    # Replace with logic to select the best solution from the population
    return population[0]

def genetic_algorithm(population_size, num_iterations, relevant_documents):
    population = initialize_population(population_size)
    for iteration in range(num_iterations):
        fitness_values = evaluate_fitness(population, relevant_documents)
        selected_population = select_population(population, fitness_values)
        offspring_population = create_offspring(selected_population)
        population = offspring_population
    best_solution = select_best_solution(population)
    return best_solution

In [None]:
# Algorithm 4: Integration between MGA and CA
def split_dataset(dataset_path):
    # Replace with logic to split the dataset into training and testing sets
    training_set = []
    testing_set = []
    return training_set, testing_set

def evaluate_solution(solution, testing_set):
    # Replace with logic to evaluate the solution and calculate accuracy measures
    accuracy = random.uniform(0.7, 0.95)
    recall = random.uniform(0.6, 0.98)
    precision = random.uniform(0.65, 0.99)
    f1 = random.uniform(0.6, 0.97)
    return accuracy, recall, precision, f1

def save_result(result):
    # Replace with logic to save the best result
    # Example: Save the result to a file or database
    with open("results.txt", "a") as file:
        file.write(f"Accuracy: {result['accuracy']}\n")
        file.write(f"Recall: {result['recall']}\n")
        file.write(f"Precision: {result['precision']}\n")
        file.write(f"F1 Score: {result['f1']}\n")

def display_best_result():
    # Replace with logic to display the best result
    # Example: Print the best result to the console
    print("Best Result:")
    print(f"Accuracy: {best_result['accuracy']}")
    print(f"Recall: {best_result['recall']}")
    print(f"Precision: {best_result['precision']}")
    print(f"F1 Score: {best_result['f1']}")

# Main Function
if __name__ == "__main__":
    dataset_path = "/content/drive/MyDrive/Major_Project/Project_code/webkb-data/webkb"
    num_iterations = 15
    threshold = 10
    accuracy_threshold = 0.9
    advanced_document_indexing(dataset_path)
    integration_algorithm(dataset_path, num_iterations, threshold, accuracy_threshold)

NameError: ignored

In [None]:
import os
import re
import random
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import chardet

# Function to extract page ID from the filename
def get_page_id(file_name):
    match = re.search(r'page(\d+)', file_name)
    if match:
        return int(match.group(1))
    else:
        return None

# Function to extract page name from the filename
def get_page_name(file_name):
    # Extract page name from the filename by removing the file extension
    return os.path.splitext(file_name)[0]

# Function to preprocess text
def preprocess_text(text):
    # Tokenize the content by splitting on spaces (you can use more advanced tokenization)
    words = text.split()

    # Remove stop words, special characters, and sentence delimiters
    stop_words = set(["the", "and", "is", "in", "it", "an", "to", "of", "as", "by"])
    words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

    return words

# Function to preprocess HTML content
def preprocess_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract information from the document
    page_title_tag = soup.title
    if page_title_tag and page_title_tag.string:
        page_name = page_title_tag.string.strip()  # Extract page name
    else:
        page_name = "N/A"  # Assign a default value if no title tag or title text is found

    # Tokenize and preprocess the content
    text_content = soup.get_text()
    valid_tokens = preprocess_text(text_content)

    return page_name, text_content, valid_tokens

# Function to perform document indexing for both text and HTML
def document_indexing(dataset_path):
    pages_information = {}
    words_information = {}

    for root, dirs, files in os.walk(dataset_path):
        for file_name in files:
            file_path = os.path.join(root, file_name)  # Get the full file path

            # Detect the encoding of the file
            with open(file_path, 'rb') as file:
                raw_data = file.read()
                encoding_info = chardet.detect(raw_data)
                encoding = encoding_info['encoding']

            # Read the content using the detected encoding
            with open(file_path, 'r', encoding=encoding) as file:
                document_content = file.read()

            if file_path.endswith(".html"):  # Check the file path, not the file object
                # Process HTML content
                page_name, text_content, valid_tokens = preprocess_html(document_content)
            else:
                # Process text content
                page_name = get_page_name(file_name)
                valid_tokens = preprocess_text(document_content)

            # Store page information
            page_id = get_page_id(file_name)
            pages_information[page_id] = {
                "page_name": page_name,
                "total_weight": len(valid_tokens),
                "total_count_word": len(valid_tokens),
            }

            # Update words information
            for word in valid_tokens:
                if word not in words_information:
                    words_information[word] = {"pages_list": [page_id]}
                else:
                    words_information[word]["pages_list"].append(page_id)

    # Replace None with the actual count of occurrences
    for word, info in words_information.items():
        info["pages_list"] = len(info["pages_list"])

    return pages_information, words_information

# Function to perform query search
def query_search(user_query, words_information):
    query_words = preprocess_text(user_query)
    id_list = []

    for word in query_words:
        if word in words_information:
            id_list.extend([word] * words_information[word]["pages_list"])  # Count word occurrences
        else:
            print(f"Word '{word}' not found in indexed words.")

    return id_list

def calculate_accuracy(retrieved_ids, relevant_ids):
    tp = len(set(retrieved_ids).intersection(relevant_ids))
    fp = len(retrieved_ids) - tp
    fn = len(relevant_ids) - tp

    accuracy = (tp + 0.0) / (tp + fp + fn) if (tp + fp + fn) != 0 else 0.0
    precision = (tp + 0.0) / (tp + fp) if (tp + fp) != 0 else 0.0
    recall = (tp + 0.0) / (tp + fn) if (tp + fn) != 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0.0

    return accuracy, precision, recall, f1

# Function to plot a bar chart for accuracy measures
def plot_accuracy_bar_chart(accuracy, precision, recall, f1):
    labels = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
    values = [accuracy, precision, recall, f1]  # No need to convert to float

    plt.bar(labels, values)
    plt.title('Accuracy Measures')
    plt.ylim(0, 1.2)
    plt.show()

    labels = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
    values = [accuracy, precision, recall, f1]

    plt.bar(labels, values)
    plt.title('Accuracy Measures')
    plt.ylim(0, 1.2)
    plt.show()

# Function to print semantic results
def print_semantic_results(pages_information, retrieved_ids, relevant_ids):
    print("ID\tQuery\tSemantic Results\tRecall\tPrecision")
    for page_id, page_info in pages_information.items():
        semantic_results = f"{retrieved_ids.count(page_id)} of {relevant_ids.count(page_id)}"

        # Calculate recall for the page
        recall = page_info.get('recall', 0.0)  # Get recall value from the 'pages_information' dictionary

        precision = "{:.2%}".format(page_info.get('precision', 0.0))

        print(f"{page_id}\t{page_info['page_name']}\t{semantic_results}\t{recall:.2%}\t{precision}")


# Example usage
if __name__ == "__main__":
    dataset_path = "/content/drive/MyDrive/Major_Project/Project_code/webkb-data/webkb"  # Replace with the path to your dataset

    # Algorithm 1: Document Indexing
    pages_information, words_information = document_indexing(dataset_path)

    # Algorithm 2: Query Search
    user_query = "information Retrieval"
    relevant_ids = [1, 0, 1]  # Example relevant IDs (modify with your actual relevant IDs)
    retrieved_ids = query_search(user_query, words_information)

    # Algorithm 3: Modified Genetic Algorithm
    population_size = 50
    num_iterations = 10
    best_solution = modified_genetic_algorithm(population_size, num_iterations, relevant_ids)


    # Algorithm 4: Calculate Accuracy Measures
    accuracy, precision, recall, f1 = calculate_accuracy(retrieved_ids, relevant_ids)

    # Print semantic results
    print_semantic_results(pages_information, retrieved_ids, relevant_ids)

    # Print results
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

    # Plot accuracy measures
    plot_accuracy_bar_chart(accuracy, precision, recall, f1)

  soup = BeautifulSoup(html_content, 'html.parser')


TypeError: ignored