In [None]:
from google.colab import drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
dataset_path = '/content/drive/MyDrive/webkb-data/webkb/course'
webkb_documents = os.listdir(dataset_path)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
def search_algorithm(user_query, inverted_index):
    query_words = user_query.split()

    # Initialize the result with the first word's inverted index
    result = set(inverted_index.get(query_words[0], []))

    # Step through each word in the user query and intersect their inverted indices
    for word in query_words[1:]:
        result = result.intersection(inverted_index.get(word, []))

    # Step 12: Return ID-List array
    return list(result)

# Example usage with inverted index
inverted_index = {
    "example": [1, 2, 3],
    "query": [2, 3, 4],
    "string": [1, 4, 5]
}

user_query = input("Enter your query: ")
result = search_algorithm(user_query, inverted_index)
print("Result:", result)

Enter your query: query
Result: [2, 3, 4]


In [None]:
import random

# Function to encode problem into a chromosome
def encode_problem(problem_words, word_index):
    chromosome = [0] * len(word_index)
    for word in problem_words:
        if word in word_index:
            chromosome[word_index[word]] = 1
    return chromosome

# Function to initialize population using inverted indexing
def initial_generation(population_size, problem_words, word_index):
    population = []
    for _ in range(population_size):
        chromosome = encode_problem(problem_words, word_index)
        population.append(chromosome)
    return population

# Function to evaluate fitness of a chromosome
def evaluate_fitness(chromosome, problem_words, word_index):
    fitness = 0
    for word in problem_words:
        if chromosome[word_index[word]] == 1:
            fitness += 1
    return fitness

# Function for selection (roulette wheel selection)
def selection(population, fitness):
    total_fitness = sum(fitness)
    probabilities = [fit / total_fitness for fit in fitness]
    selected_parents = random.choices(population, probabilities, k=len(population))
    return selected_parents

# Function for crossover (single-point crossover)
def crossover(parents):
    offspring = []
    for i in range(0, len(parents), 2):
        crossover_point = random.randint(1, len(parents[i]) - 1)
        child1 = parents[i][:crossover_point] + parents[i + 1][crossover_point:]
        child2 = parents[i + 1][:crossover_point] + parents[i][crossover_point:]
        offspring.extend([child1, child2])
    return offspring

# Function for mutation (bit flip mutation)
def mutation(offspring, mutation_rate):
    for i in range(len(offspring)):
        for j in range(len(offspring[i])):
            if random.random() < mutation_rate:
                offspring[i][j] = 1 - offspring[i][j]
    return offspring

# Function for the genetic algorithm
def genetic_algorithm(problem_words, population, generations, word_index, mutation_rate=0.1):
    for _ in range(generations):
        fitness = [evaluate_fitness(chromosome, problem_words, word_index) for chromosome in population]
        parents = selection(population, fitness)
        offspring = crossover(parents)
        offspring = mutation(offspring, mutation_rate)
        population = offspring
    best_chromosome = max(population, key=lambda x: evaluate_fitness(x, problem_words, word_index))
    return best_chromosome

# Example usage with inverted indexing
problem_words = ["example", "query", "string"]
word_index = {word: i for i, word in enumerate(problem_words)}
population_size = 100
generations = 100

initial_population = initial_generation(population_size, problem_words, word_index)
best_chromosome = genetic_algorithm(problem_words, initial_population, generations, word_index)
print("Best Chromosome:", best_chromosome)


Best Chromosome: [1, 1, 1]


In [None]:
import random

def generate_random_binary_sequence(length):
    return [random.choice([0, 1]) for _ in range(length)]

In [None]:
def build_inverted_index(documents):
    inverted_index = {}
    for doc_id, document in enumerate(documents):
        for word in document:
            if word not in inverted_index:
                inverted_index[word] = []
            inverted_index[word].append(doc_id)
    return inverted_index


In [None]:
import os
import random

# Step 1: Read Dataset
dataset_path = "/content/drive/MyDrive/webkb-data/webkb/course"
webkb_documents = os.listdir(dataset_path)

# Assign incremental IDs to documents
document_ids = {document: idx for idx, document in enumerate(webkb_documents)}

# Rest of your code remains the same, just replace doc_id generation with the assigned ID
# ...
# Step 2: Process each open document
for document in webkb_documents:
    doc_path = os.path.join(dataset_path, document)
    doc_id = document_ids[document]  # Assigned ID for the document

    # Rest of your code for processing the document goes here
    # ...


In [None]:
from bs4 import BeautifulSoup
from collections import defaultdict
import re

# Initialize the inverted index as a defaultdict of lists
inverted_index = defaultdict(list)

# Step 2: Process each folder
for folder_name in webkb_documents:
    folder_path = os.path.join(dataset_path, folder_name)
    doc_id = document_ids[folder_name]  # Assigned ID for the folder

    # Step 3: Extract information from HTML files inside the folder
    for html_file in os.listdir(folder_path):
        if html_file.endswith(".html"):
            html_file_path = os.path.join(folder_path, html_file)
            with open(html_file_path, "rb") as f:  # Open in binary mode
                html_content = f.read()

            # Decode the content using a specific encoding (e.g., ISO-8859-1)
            try:
                decoded_content = html_content.decode("iso-8859-1")
            except UnicodeDecodeError:
                print(f"Error decoding file: {html_file_path}")
                continue

            # Parse HTML content and extract text
            soup = BeautifulSoup(decoded_content, "html.parser")
            text_content = soup.get_text()

            # Tokenization and processing (you can customize this part based on your requirements)
            words = re.findall(r'\w+', text_content.lower())  # Simple tokenization

            # Update inverted index with document ID
            for word in words:
                # Check if word is special character or stop-word
                if word in ["special", "stop", "words"]:
                    continue

                # Update inverted index with document ID
                inverted_index[word].append(doc_id)

# Step 4: Display the inverted index
print("Inverted Index:")
for word, document_ids in inverted_index.items():
    print(f"Word: {word}, Document IDs: {document_ids}")

# Example search
search_word = "example"
if search_word in inverted_index:
    matching_documents = inverted_index[search_word]
    print(f"Documents containing '{search_word}': {matching_documents}")
else:
    print(f"No documents found containing '{search_word}'")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Word: eaton, Document IDs: [2, 2, 2, 2, 2, 2, 2, 2, 2]
Word: 518, Document IDs: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Word: 2860, Document IDs: [2]
Word: hollingd, Document IDs: [2]
Word: 204, Document IDs: [2, 2, 2, 3]
Word: 4857, Document IDs: [2]
Word: collinsr, Document IDs: [2]
Word: osi, Document IDs: [2]
Word: tftp, Document IDs: [2]
Word: 783, Document IDs: [2]
Word: 854, Document IDs: [2]
Word: 959, Document IDs: [2]
Word: relay, Document IDs: [2]
Word: chat, Document IDs: [2]
Word: xdr, Document IDs: [2]
Word: rpctalk, Document IDs: [2, 2]
Word: authoritative, Document IDs: [2]
Word: kerberos, Document IDs: [2, 2, 2]
Word: viewers, Document IDs: [2, 2]
Word: rfcs, Document IDs: [2, 2, 2]
Word: abuse, Document IDs: [2]
Word: 3014, Document IDs: [2]
Word: 438, Document IDs: [2, 2]
Word: spooner, Document IDs: [2, 2]
Word: unofficial, Document IDs: [2, 2]
Word: spoonerd, Document IDs: [2]
Word: 23349, Document IDs: [2]
Wo

In [None]:
# Sample inverted index
inverted_index = {
    "example": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4],
    "query": [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4],
    # Add more terms and corresponding document IDs to the inverted index
}

# Given ground_truth dictionary
ground_truth = {
    "example":
    # Add more search queries and corresponding correct document IDs as needed
}

def calculate_accuracy(ground_truth, inverted_index):
    total_correct_predictions = 0
    total_predictions = 0

    for query, correct_document_ids in ground_truth.items():
        if query in inverted_index:
            predicted_document_ids = inverted_index[query]

            # Count correct predictions for this query term
            correct_predictions = sum(1 for doc_id in predicted_document_ids if doc_id in correct_document_ids)

            total_correct_predictions += correct_predictions
            total_predictions += len(predicted_document_ids)

    # Calculate accuracy
    accuracy = (total_correct_predictions / total_predictions) * 100 if total_predictions > 0 else 0
    return accuracy

# Calculate accuracy
accuracy = calculate_accuracy(ground_truth, inverted_index)

# Print accuracy
print("Accuracy:", accuracy, "%")





Accuracy: 94.8905109489051 %
