In [None]:
!pip install requests beautifulsoup4



Code from HW2

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from collections import defaultdict
import requests
import re
import json
import os
import pandas as pd
import math
from urllib.parse import urljoin, urlparse
import time


# Fetch the NLTK resources this notebook relies on ('stopwords', 'punkt', 'wordnet').
# quiet=True suppresses the downloader's own progress output.
print("Downloading NLTK resources...")
for resource_name in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource_name, quiet=True)
print("NLTK resources downloaded.\n")

# Function to extract and normalize links from a webpage
def get_links(url):
    """Download the page at `url` and return the absolute http(s) links it contains.

    Every `<a href=...>` value is resolved against `url`, so root-relative
    ("/a/b"), document-relative ("b.html", "../c") and protocol-relative
    ("//host/x") links are all turned into absolute URLs. Links with any other
    scheme (mailto:, javascript:, tel:, ...) and same-page fragment links are
    ignored.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of absolute URLs in document order (duplicates are kept).
        An empty list is returned when the page cannot be fetched or parsed.
    """
    print(f"Extracting links from: {url}")
    try:
        response = requests.get(url, timeout=10)
        # Do not harvest links from error pages (4xx/5xx responses).
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        normalized_links = []
        for anchor in soup.find_all('a', href=True):
            href = anchor.get('href').strip()
            # Empty hrefs and "#section" links only point back at this page.
            if not href or href.startswith('#'):
                continue
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            absolute_link = urljoin(url, href)
            if urlparse(absolute_link).scheme in ('http', 'https'):
                normalized_links.append(absolute_link)

        print(f"Found {len(normalized_links)} normalized links.\n")
        return normalized_links
    except Exception as e:
        # Best-effort helper: a page that fails to load simply contributes no links.
        print(f"Error extracting links from {url}: {e}\n")
        return []

# Text preprocessing functions
from nltk.corpus import stopwords

def remove_stop_words(text, isAlphaChecking=True):
    """Drop English stop words from a sequence of tokens.

    Args:
        text: Iterable of string tokens.
        isAlphaChecking: When true, tokens that are not purely alphabetic
            (per str.isalpha) are dropped as well.

    Returns:
        A list of the surviving tokens, in their original order and casing.
    """
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in text:
        # Stop-word comparison is case-insensitive.
        if token.lower() in english_stops:
            continue
        if isAlphaChecking and not token.isalpha():
            continue
        kept_tokens.append(token)
    return kept_tokens

def trim_words(words):
    """Return the words with trailing punctuation removed.

    Only the right-hand end of each word is stripped; the characters removed
    are , . \\ / ? ! ' and ".
    """
    trailing_punctuation = ",.\\/?!'\""
    trimmed = []
    for token in words:
        trimmed.append(token.rstrip(trailing_punctuation))
    return trimmed

def apply_stemming(words):
    """Return the Porter stem of every word, preserving order."""
    stem = PorterStemmer().stem
    return [stem(token) for token in words]

def apply_lemmatization(words):
    """Return the WordNet lemma of every word, preserving order.

    Tokens are lower-cased before lemmatization so that the resulting index
    terms match query terms, which are lower-cased during query preprocessing.

    Args:
        words: Iterable of string tokens.

    Returns:
        A list with one lemmatized, lower-case token per input token.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word.lower()) for word in words]

# Function to create index and collect search results
def create_index(url, max_pages=250):
    """Crawl pages breadth-first starting at `url` and build an inverted index.

    Args:
        url: Start URL. Only links that begin with this string are followed.
        max_pages: Maximum number of distinct URLs to attempt. Pages whose
            request fails or whose content is not HTML still count.

    Returns:
        A tuple (index, results, doc_id_to_url):
          - index: word -> {doc_id: term frequency in that document}
          - results: one dict per indexed page with the keys
            'URL', 'Title', 'Snippet' and 'FullText'
          - doc_id_to_url: doc_id -> normalized page URL
        Document IDs follow crawl order, so they may have gaps where a page
        could not be fetched.
    """
    from collections import deque  # O(1) pops from the front of the crawl queue

    def _normalize(raw_url):
        # Keep scheme, host and path only (no query/fragment, no trailing slash)
        # so that equivalent URLs collapse to a single key.
        parts = urlparse(raw_url)
        return parts.scheme + "://" + parts.netloc + parts.path.rstrip('/')

    print("Starting to create index and collect search results...\n")
    index = defaultdict(dict)   # word -> {doc_id: term_frequency}
    visited = set()             # normalized URLs already attempted
    queue = deque([url])
    queued = {_normalize(url)}  # normalized URLs that were ever enqueued
    results = []                # search results with relevant fields
    pages_crawled = 0
    link_ids = {}               # normalized URL -> unique document ID

    # One session for the whole crawl: connections are reused and the session
    # is closed when the crawl ends.
    with requests.Session() as session:
        session.max_redirects = 2

        while queue and pages_crawled < max_pages:
            normalized_url = _normalize(queue.popleft())

            if normalized_url in visited:
                print(f"Already visited: {normalized_url}\n")
                continue

            visited.add(normalized_url)
            pages_crawled += 1
            print(f"Crawling ({pages_crawled}/{max_pages}): {normalized_url}")

            try:
                response = session.get(normalized_url, allow_redirects=True, timeout=10)
                response.raise_for_status()
            except requests.Timeout:
                print(f"Timeout: {normalized_url}\n")
                continue
            except requests.RequestException as e:
                print(f"RequestException: {e} for URL: {normalized_url}\n")
                continue

            # Binary documents (e.g. PDFs) would otherwise be tokenized as raw
            # bytes and pollute the index with meaningless terms.
            content_type = response.headers.get('Content-Type', '')
            if content_type and 'html' not in content_type.lower():
                print(f"Skipping non-HTML content ({content_type}): {normalized_url}\n")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # <title> may be missing, or present without a plain string child.
            title_tag = soup.title
            title = title_tag.string.strip() if title_tag and title_tag.string else 'No Title'

            # Extract full text
            text = soup.get_text(separator=' ', strip=True)
            snippet = text[:200] + '...' if len(text) > 200 else text

            # Assign the document ID up front so it exists even when the page
            # yields no indexable words.
            doc_id = pages_crawled
            link_ids[normalized_url] = doc_id

            results.append({
                'URL': normalized_url,
                'Title': title,
                'Snippet': snippet,
                'FullText': text  # Store full text for TF-IDF
            })

            print(f"Title Extracted: {title}")
            print(f"Snippet Extracted: {snippet}\n")

            # Trim punctuation BEFORE the alphabetic filter; otherwise tokens
            # such as "health," are rejected by isalpha() and never indexed.
            words = trim_words(text.split())
            words = remove_stop_words(words)
            words = apply_stemming(words)  # or apply_lemmatization(words) based on preference

            print(f"Number of words after preprocessing: {len(words)}")

            # Update term frequencies for this document
            for word in words:
                postings = index[word]
                postings[doc_id] = postings.get(doc_id, 0) + 1

            print(f"Indexed {len(words)} words for document ID {doc_id}.\n")

            # NOTE: get_links downloads the page again to extract its anchors.
            links = get_links(normalized_url)
            enqueued = 0
            for link in links:
                # Stay under the start URL and skip the certifications section.
                if not link.startswith(url) or 'services/certifications' in link:
                    continue
                link_key = _normalize(link)
                # Compare normalized forms so each page is queued at most once.
                if link_key in queued:
                    continue
                queued.add(link_key)
                queue.append(link)
                enqueued += 1
            print(f"Enqueued {enqueued} new links.\n")

    # Create reverse mapping from doc_id to URL
    doc_id_to_url = {v: k for k, v in link_ids.items()}

    print(f"Finished crawling. Total pages crawled: {pages_crawled}")
    print(f"Total unique words indexed: {len(index)}\n")
    return index, results, doc_id_to_url

# Function to rank words based on total frequency across all documents
def create_ranked_words(index):
    """Rank every indexed word by its total frequency across all documents.

    Args:
        index: Mapping word -> {doc_id: term frequency}.

    Returns:
        A dict ordered from most to least frequent word:
        word -> {'rank': 1-based position, 'counter': total frequency}.
        Words with equal totals keep the order they have in `index`.
    """
    print("Ranking words based on total frequency across all documents...")
    totals = {term: sum(postings.values()) for term, postings in index.items()}
    # The sort is stable, so ties stay in insertion order.
    by_frequency = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    ranked_dict = {
        term: {'rank': position, 'counter': total}
        for position, (term, total) in enumerate(by_frequency, start=1)
    }
    print("Word ranking completed.\n")
    return ranked_dict

# Function to export search results to Excel
def export_search_results(search_results, filename='search_results.xlsx'):
    """Write the crawled page metadata to an Excel workbook.

    The export is best-effort: a failure to write the file is reported on
    stdout instead of being raised, so the rest of the pipeline keeps running.

    Args:
        search_results: List of per-page dicts (one row per page).
        filename: Path of the workbook to create.
    """
    print(f"Exporting search results to {filename}...")
    df = pd.DataFrame(search_results)
    try:
        df.to_excel(filename, index=False)
    except Exception as e:
        # Previously swallowed silently, which hid missing-engine and
        # permission errors from the user.
        print(f"Error exporting search results: {e}")
    else:
        print(f"Search results have been exported to {filename}\n")

# Function to export inverted index to Excel
def export_inverted_index(index, chosen_words, filename='inverted_index.xlsx'):
    """Write the postings of the chosen words to an Excel workbook.

    The sheet has one row per (word, document) pair with the columns
    'Word', 'Document_ID' and 'Term_Frequency'. All documents of a word are
    exported, ordered by document ID.

    Args:
        index: Mapping word -> {doc_id: term frequency}.
        chosen_words: Words to export, in output order. Words that are not in
            the index contribute no rows.
        filename: Path of the workbook to create.
    """
    print(f"Building inverted index for the top {len(chosen_words)} words and exporting to {filename}...")
    inverted_index_data = []
    for word in chosen_words:
        # .get() instead of index[word]: indexing a defaultdict would insert an
        # empty posting list for every unknown word as a side effect.
        doc_dict = index.get(word, {})
        for doc_id, count in sorted(doc_dict.items()):
            inverted_index_data.append({'Word': word, 'Document_ID': doc_id, 'Term_Frequency': count})
    # Explicit columns keep the header row even when there is nothing to export.
    df_inverted = pd.DataFrame(inverted_index_data, columns=['Word', 'Document_ID', 'Term_Frequency'])
    df_inverted.to_excel(filename, index=False)
    print(f"Inverted index has been exported to {filename}\n")

def get_words_per_url(doc_id_to_url):
    """Download each document again and count its whitespace-separated words.

    Args:
        doc_id_to_url: Mapping doc_id -> URL.

    Returns:
        A dict doc_id -> {"url": url, "totalwords": word_count}. Pages that
        cannot be fetched or parsed are reported on stdout and get a count of 0.
    """
    updated_doc_id_to_url = {}
    for doc_id, url in doc_id_to_url.items():
        try:
            # Without a timeout a single unresponsive server would block the
            # whole loop indefinitely.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            text = soup.get_text()
            word_count = len(text.split())
        except Exception as e:
            print(f"Error accessing {url}: {e}")
            word_count = 0
        updated_doc_id_to_url[doc_id] = {"url": url, "totalwords": word_count}
    return updated_doc_id_to_url

def create_term_doc_excel_for_all_terms(index, doc_id_to_url, file_path="term_doc_appearance_all_terms.xlsx"):
    """Export the full term/document frequency matrix to an Excel workbook.

    Rows are the indexed terms in sorted order; the first column ("term/doc")
    holds the term and every further column is one document, headed by its URL
    (or "Doc <id>" when the ID has no URL). Cells hold the term frequency,
    0 when the term does not occur in the document.
    """
    terms = sorted(index.keys())
    # Every document ID that appears in any posting list, in ascending order.
    doc_ids = sorted({doc_id for postings in index.values() for doc_id in postings})
    headers = [doc_id_to_url.get(doc_id, f"Doc {doc_id}") for doc_id in doc_ids]

    table = {"term/doc": list(terms)}
    for header in headers:
        table[header] = []

    # Fill the matrix one document column at a time.
    for doc_id, header in zip(doc_ids, headers):
        table[header].extend(index[term].get(doc_id, 0) for term in terms)

    frame = pd.DataFrame(table)
    frame.to_excel(file_path, index=False)
    return

def create_term_doc_excel_with_query(index, doc_id_to_url, query, file_path="term_doc_appearance.xlsx"):
    """Export the term frequencies of the query's terms to an Excel workbook.

    The query is split on whitespace; stop words and tokens that are not
    purely alphabetic are dropped, the rest is lower-cased and Porter-stemmed.
    Only terms present in `index` produce a row. The first column
    ("term/doc") holds the term, every further column is one document headed
    by its URL (or "Doc <id>" when the ID has no URL).

    This function works purely from the in-memory index; it does not download
    any page. (An earlier revision re-fetched every document here and then
    discarded the result.)

    Args:
        index: Mapping word -> {doc_id: term frequency}.
        doc_id_to_url: Mapping doc_id -> URL.
        query: Free-text query string.
        file_path: Path of the workbook to create.
    """
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    # Tokenize query, remove stop words, and apply stemming
    processed_query_terms = [
        stemmer.stem(word.lower())
        for word in query.split()
        if word.lower() not in stop_words and word.isalpha()
    ]

    # Extract relevant terms (only those in the index)
    relevant_terms = [term for term in processed_query_terms if term in index]

    # All document IDs in ascending order, for consistent column ordering
    all_doc_ids = sorted({doc_id for term_docs in index.values() for doc_id in term_docs})

    # Map document IDs to URLs for column headers
    urls = [doc_id_to_url.get(doc_id, f"Doc {doc_id}") for doc_id in all_doc_ids]

    data = {"term/doc": list(relevant_terms)}
    for column in urls:
        data[column] = []

    # One row per relevant term, one frequency per document
    for term in relevant_terms:
        postings = index[term]
        for doc_id, column in zip(all_doc_ids, urls):
            data[column].append(postings.get(doc_id, 0))

    df = pd.DataFrame(data)
    df.to_excel(file_path, index=False)
    return

# Main Execution

# TODO:
#   remove 16 word limit for inverted index and calculate for all documents instead of 20
#   after retrieving the results and calculating the inverted index run each query against the results to get the search results.

# Script entry point: crawl the site, rank the indexed words and write the
# Excel reports. The names assigned here are module-level globals.
if __name__ == "__main__":
    start = time.time()
    website_url = 'https://www.who.int'
    # The inner single quotes are part of the query text. Query preprocessing
    # in create_term_doc_excel_with_query keeps only purely alphabetic tokens,
    # so "'which" and "19'" are dropped there.
    query = "'which medicine is used for covid 19'"
    # ============================
    # 1. Creating the Index and Collecting Results
    # ============================

    max_pages_to_crawl = 30
    index, search_results, doc_id_to_url = create_index(website_url, max_pages=max_pages_to_crawl)

    # Create ranked words
    ranked_words = create_ranked_words(index)
    # All ranked words are exported; the trailing comment shows the slice that
    # would restrict the export to the 15 most frequent words.
    chosen_words = list(ranked_words.keys())#[:15]   extract top 15 keywords
    print(f"Chosen Words for Inverted Index: {chosen_words}\n")

    # ============================
    # 2. Export Search Results to Excel
    # ============================

    export_search_results(search_results, filename='search_results.xlsx')

    # ============================
    # 3. Build and Export Inverted Index
    # ============================
    export_inverted_index(index, chosen_words, filename='inverted_index.xlsx')
    # Term/document matrices: one restricted to the query terms, one for all terms.
    create_term_doc_excel_with_query(index, doc_id_to_url, query)
    create_term_doc_excel_for_all_terms(index, doc_id_to_url)
    end = time.time()
    # Total elapsed wall-clock time in seconds.
    print(end - start)


In [None]:
print()

Crawling and indexing without Excel files

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from collections import defaultdict
import requests
import re
import json
import os
import pandas as pd
import math
from urllib.parse import urljoin, urlparse
import time

# Fetch the NLTK resources this cell relies on ('stopwords', 'punkt', 'wordnet').
# quiet=True suppresses the downloader's own progress output.
print("Downloading NLTK resources...")
for resource_name in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource_name, quiet=True)
print("NLTK resources downloaded.\n")

from nltk.corpus import stopwords

# Text preprocessing functions
def remove_stop_words(text_tokens):
    """Keep only purely alphabetic tokens that are not English stop words.

    The stop-word comparison is case-insensitive; surviving tokens keep their
    original casing and order.
    """
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in text_tokens:
        if token.lower() in english_stops:
            continue
        if not token.isalpha():
            continue
        kept_tokens.append(token)
    return kept_tokens

def trim_words(words):
    """Strip trailing punctuation (, . \\ / ? ! ' ") from each word.

    Only the right-hand end of a word is trimmed; leading punctuation stays.
    """
    trailing_punctuation = ",.\\/?!'\""
    return [token.rstrip(trailing_punctuation) for token in words]

def apply_stemming(words):
    """Lower-case each token and reduce it to its Porter stem."""
    stem = PorterStemmer().stem
    return [stem(token.lower()) for token in words]

def apply_lemmatization(words):
    """Lower-case each token and map it to its WordNet lemma."""
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token.lower()) for token in words]

def get_links(url):
    """
    Download the page at `url` and return the absolute http(s) links it contains.

    Every `<a href=...>` value is resolved against `url`, so root-relative
    ("/a/b"), document-relative ("b.html", "../c") and protocol-relative
    ("//host/x") links are all turned into absolute URLs. Links with any other
    scheme (mailto:, javascript:, tel:, ...) and same-page fragment links are
    ignored.

    Returns a list of absolute URLs in document order (duplicates are kept),
    or an empty list when the page cannot be fetched or parsed.
    """
    print(f"Extracting links from: {url}")
    try:
        response = requests.get(url, timeout=10)
        # Do not harvest links from error pages (4xx/5xx responses).
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        normalized_links = []
        for anchor in soup.find_all('a', href=True):
            href = anchor.get('href').strip()
            # Empty hrefs and "#section" links only point back at this page.
            if not href or href.startswith('#'):
                continue
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            absolute_link = urljoin(url, href)
            if urlparse(absolute_link).scheme in ('http', 'https'):
                normalized_links.append(absolute_link)

        print(f"Found {len(normalized_links)} normalized links.\n")
        return normalized_links
    except Exception as e:
        # Best-effort helper: a page that fails to load simply contributes no links.
        print(f"Error extracting links from {url}: {e}\n")
        return []

def create_index(start_url, max_pages=50, excluded_terms=None):
    """
    Crawl up to `max_pages` URLs breadth-first starting from `start_url`.

    Args:
      start_url: first page to fetch; only links that begin with this string
        are followed.
      max_pages: maximum number of distinct URLs to attempt (pages whose
        request fails or whose content is not HTML still count).
      excluded_terms: optional iterable of substrings; a link containing any
        of them is never enqueued. Defaults to the built-in list below.

    Returns (index, results, doc_id_to_url, doc_length):
      - index: word -> {doc_id: term_frequency_in_doc}
      - results: list of metadata for each indexed page (URL, Title, Snippet, FullText)
      - doc_id_to_url: map doc_id -> URL
      - doc_length: map doc_id -> total number of preprocessed words (for TF normalization)
    Document IDs follow crawl order, so they may have gaps where a page could
    not be fetched.
    """
    from collections import deque  # O(1) pops from the front of the crawl queue

    def _normalize(raw_url):
        # Keep scheme, host and path only (no query/fragment, no trailing slash)
        # so that equivalent URLs collapse to a single key.
        parts = urlparse(raw_url)
        return parts.scheme + "://" + parts.netloc + parts.path.rstrip('/')

    if excluded_terms is None:
        # Substrings of URLs that should not be crawled.
        excluded_terms = (
            "/mega-menu",
            "/services/certifications",
            "/login",
            "/subscribe",
            "/facebook",
            "/twitter",
            "/instagram",
            "/youtube",
            "/search?",
            "/draft",
            "/template",
            "/newsletters",
            "/maldives",
            "/footer",
            "/southeastasia",
            "/south-east-asia",
            "/europe",
            "/health-topics",
            "/about-us",
        )
    else:
        excluded_terms = tuple(excluded_terms)

    index = defaultdict(dict)
    visited = set()                   # normalized URLs already attempted
    queue = deque([start_url])
    queued = {_normalize(start_url)}  # normalized URLs that were ever enqueued
    results = []
    pages_crawled = 0
    link_ids = {}
    doc_length = {}

    # One session for the whole crawl: connections are reused and the session
    # is closed when the crawl ends.
    with requests.Session() as session:
        session.max_redirects = 2

        while queue and pages_crawled < max_pages:
            normalized_url = _normalize(queue.popleft())

            if normalized_url in visited:
                continue

            visited.add(normalized_url)
            pages_crawled += 1
            print(f"Crawling ({pages_crawled}/{max_pages}): {normalized_url}")

            try:
                response = session.get(normalized_url, allow_redirects=True, timeout=10)
                response.raise_for_status()
            except requests.Timeout:
                print(f"Timeout: {normalized_url}\n")
                continue
            except requests.RequestException as e:
                print(f"RequestException: {e} for URL: {normalized_url}\n")
                continue

            # Binary documents (e.g. PDFs) would otherwise be tokenized as raw
            # bytes and pollute the index with meaningless terms.
            content_type = response.headers.get('Content-Type', '')
            if content_type and 'html' not in content_type.lower():
                print(f"Skipping non-HTML content ({content_type}): {normalized_url}\n")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # <title> may be missing, or present without a plain string child.
            title_tag = soup.title
            title = title_tag.string.strip() if title_tag and title_tag.string else 'No Title'

            # Extract text for indexing
            text = soup.get_text(separator=' ', strip=True)
            snippet = text[:200] + '...' if len(text) > 200 else text

            # Assign doc_id for this page
            doc_id = pages_crawled
            link_ids[normalized_url] = doc_id

            # Store metadata
            results.append({
                'URL': normalized_url,
                'Title': title,
                'Snippet': snippet,
                'FullText': text
            })

            print(f"Title Extracted: {title}")
            print(f"Snippet Extracted: {snippet[:100]}\n")

            # Trim punctuation BEFORE the alphabetic filter; otherwise tokens
            # such as "health," are rejected by isalpha() and never indexed.
            cleaned_tokens = trim_words(text.split())
            cleaned_tokens = remove_stop_words(cleaned_tokens)
            cleaned_tokens = apply_stemming(cleaned_tokens)

            doc_length[doc_id] = len(cleaned_tokens)
            print(f"Number of tokens after preprocessing: {doc_length[doc_id]}")

            # Update inverted index
            for token in cleaned_tokens:
                postings = index[token]
                postings[doc_id] = postings.get(doc_id, 0) + 1

            print(f"Indexed {len(cleaned_tokens)} tokens for document ID {doc_id}.\n")

            # NOTE: get_links downloads the page again to extract its anchors.
            for link in get_links(normalized_url):
                if any(excluded in link for excluded in excluded_terms):
                    continue
                # Stay under the start URL.
                if not link.startswith(start_url):
                    continue
                link_key = _normalize(link)
                # Compare normalized forms so each page is queued at most once.
                if link_key in queued:
                    continue
                queued.add(link_key)
                queue.append(link)

    doc_id_to_url = {v: k for k, v in link_ids.items()}

    print(f"Finished crawling. Total pages crawled: {pages_crawled}")
    print(f"Total unique words indexed: {len(index)}\n")

    return index, results, doc_id_to_url, doc_length


def compute_idf(index, total_docs):
    """
    Build the inverse-document-frequency table for every indexed term.

    For a term found in `df` documents:
        IDF(term) = log10( total_docs / (df + 1) ) + 1
    The +1 in the denominator keeps the division defined when df is 0.

    Returns a dictionary: { term: idf_value }
    """
    return {
        term: math.log(total_docs / (len(postings) + 1), 10) + 1
        for term, postings in index.items()
    }


def search(query, index, idf_dict, doc_length, doc_id_to_url, search_results, top_k=10):
    """
    Given a query string, rank documents by TF-IDF score.

    The query goes through the same cleaning helpers as indexed text
    (trim_words, remove_stop_words, apply_stemming) so that query terms and
    index terms are comparable. The score of a document is the sum, over the
    query tokens it contains, of (term frequency / document length) * IDF.

    Args:
      query: free-text query.
      index: word -> {doc_id: term frequency}.
      idf_dict: word -> IDF value.
      doc_length: doc_id -> number of preprocessed tokens in the document.
      doc_id_to_url: doc_id -> URL.
      search_results: list of per-page dicts with 'URL', 'Title', 'Snippet'.
      top_k: maximum number of results to return.

    Return the top_k results as a list of dicts: { 'doc_id', 'score', 'URL', 'Title', 'Snippet' },
    best score first. An empty list is returned when the query has no usable tokens.
    """

    # --- Preprocess the query ---
    # Trimming first keeps words typed with trailing punctuation
    # ("resistance?") instead of losing them to the alphabetic filter.
    query_tokens = apply_stemming(remove_stop_words(trim_words(query.split())))

    if not query_tokens:
        print("No valid tokens in the query after preprocessing.")
        return []

    # --- Accumulate scores ---
    scores = defaultdict(float)  # doc_id -> score
    for token in query_tokens:
        posting_list = index.get(token)
        if not posting_list:
            continue  # Token not in any doc
        idf = idf_dict[token]
        for doc_id, freq in posting_list.items():
            length = doc_length.get(doc_id)
            if not length:
                # No usable length for this document: TF cannot be normalized.
                continue
            scores[doc_id] += (freq / length) * idf

    # --- Sort docs by score descending ---
    ranked_doc_ids = sorted(scores.items(), key=lambda x: x[1], reverse=True)

    # --- Build final results ---
    # URL -> metadata, built once; the first entry wins for a repeated URL.
    metadata_by_url = {}
    for entry in search_results:
        metadata_by_url.setdefault(entry['URL'], entry)

    result_docs = []
    for doc_id, score in ranked_doc_ids[:top_k]:
        url = doc_id_to_url[doc_id]
        metadata = metadata_by_url.get(url)
        result_docs.append({
            'doc_id': doc_id,
            'score': score,
            'URL': url,
            'Title': metadata['Title'] if metadata else "No Title",
            'Snippet': metadata['Snippet'] if metadata else "",
        })

    return result_docs

def create_ranked_words(index):
    """
    Rank every indexed word by its total frequency across all docs.

    Returns a dict ordered from most to least frequent word:
    word -> { 'rank': 1-based position, 'counter': total_frequency_across_docs }
    Words with equal totals keep the order they have in `index`.
    """
    print("Ranking words based on total frequency across all documents...")
    totals = {term: sum(postings.values()) for term, postings in index.items()}

    # The sort is stable, so ties stay in insertion order.
    by_frequency = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)

    ranked_dict = {
        term: {'rank': position, 'counter': total}
        for position, (term, total) in enumerate(by_frequency, start=1)
    }

    print("Word ranking completed.\n")
    return ranked_dict

def export_search_results(search_results, filename='search_results.xlsx'):
    """Write the crawled page metadata to an Excel workbook.

    A failure while writing the file is reported on stdout instead of being
    raised.
    """
    print(f"Exporting search results to {filename}...")
    frame = pd.DataFrame(search_results)
    try:
        frame.to_excel(filename, index=False)
        print(f"Search results have been exported to {filename}\n")
    except Exception as export_error:
        print(f"Error exporting search results: {export_error}")

def export_inverted_index(index, chosen_words, filename='inverted_index.xlsx'):
    """Write the postings of the chosen words to an Excel workbook.

    The sheet has one row per (word, document) pair with the columns
    'Word', 'Document_ID' and 'Term_Frequency'. A failure while writing the
    file is reported on stdout instead of being raised.

    Args:
        index: Mapping word -> {doc_id: term frequency}.
        chosen_words: Words to export, in output order. Words that are not in
            the index contribute no rows.
        filename: Path of the workbook to create.
    """
    print(f"Building inverted index for the top {len(chosen_words)} words and exporting to {filename}...")
    inverted_index_data = []
    for word in chosen_words:
        # .get() instead of index[word]: indexing a defaultdict would insert an
        # empty posting list for every unknown word as a side effect.
        doc_dict = index.get(word, {})
        for doc_id, count in doc_dict.items():
            inverted_index_data.append({'Word': word, 'Document_ID': doc_id, 'Term_Frequency': count})
    # Explicit columns keep the header row even when there is nothing to export.
    df_inverted = pd.DataFrame(inverted_index_data, columns=['Word', 'Document_ID', 'Term_Frequency'])
    try:
        df_inverted.to_excel(filename, index=False)
        print(f"Inverted index has been exported to {filename}\n")
    except Exception as e:
        print(f"Error exporting inverted index: {e}")

# Optional helpers to export term-doc matrices, etc.
def create_term_doc_excel_with_query(index, doc_id_to_url, query, file_path="term_doc_appearance_query.xlsx"):
    """
    Export the per-document frequencies of the query's terms to an Excel file.

    Rows = query terms that exist in the index (column "Term")
    Columns = one per document, headed by its URL (or "Doc <id>" when unknown)

    Nothing is written when none of the query terms is in the index.
    """
    english_stops = set(nltk.corpus.stopwords.words("english"))
    stem = PorterStemmer().stem

    # Clean the query: skip stop words and non-alphabetic tokens, stem the
    # rest, and keep only stems that actually occur in the index.
    relevant_terms = []
    for raw_word in query.split():
        lowered = raw_word.lower()
        if lowered in english_stops or not raw_word.isalpha():
            continue
        stemmed = stem(lowered)
        if stemmed in index:
            relevant_terms.append(stemmed)

    if not relevant_terms:
        print("No relevant terms from the query found in the index.")
        return

    # Every document ID known to the index, ascending, with its column header.
    doc_ids = sorted({doc_id for posting in index.values() for doc_id in posting})
    headers = [doc_id_to_url.get(doc_id, f"Doc {doc_id}") for doc_id in doc_ids]

    table = {"Term": relevant_terms}
    for header in headers:
        table[header] = []

    # Fill the matrix one document column at a time.
    for doc_id, header in zip(doc_ids, headers):
        table[header].extend(index[term].get(doc_id, 0) for term in relevant_terms)

    frame = pd.DataFrame(table)
    try:
        frame.to_excel(file_path, index=False)
        print(f"Term-doc appearance for the query saved to '{file_path}'.")
    except Exception as save_error:
        print(f"Error saving term-doc appearance: {save_error}")

def create_term_doc_excel_for_all_terms(index, doc_id_to_url, file_path="term_doc_appearance_all_terms.xlsx"):
    """Export the full term/document frequency matrix to an Excel file.

    Rows are the indexed terms in sorted order (column "term/doc"); every
    further column is one document, headed by its URL or "Doc <id>". A failure
    while writing the file is reported on stdout instead of being raised.
    """
    terms = sorted(index.keys())
    doc_ids = sorted({doc_id for postings in index.values() for doc_id in postings})
    headers = [doc_id_to_url.get(doc_id, f"Doc {doc_id}") for doc_id in doc_ids]

    table = {"term/doc": list(terms)}
    for header in headers:
        table[header] = []

    # Fill the matrix one document column at a time; 0 marks an absent term.
    for doc_id, header in zip(doc_ids, headers):
        table[header].extend(index[term].get(doc_id, 0) for term in terms)

    frame = pd.DataFrame(table)
    try:
        frame.to_excel(file_path, index=False)
        print(f"All term-doc appearances saved to {file_path}")
    except Exception as save_error:
        print(f"Error saving all terms doc matrix: {save_error}")


def pretty_print_results(query, results):
    """
    Print the ranked results of one query as a plain-text table.

    Columns: Rank (1-based), Score (rounded to 4 decimals), Title, URL and
    Snippet (first 50 characters followed by "...").
    """
    column_order = ['Rank', 'Score', 'Title', 'URL', 'Snippet']
    table_rows = [
        {
            'Rank': position,
            'Score': round(hit['score'], 4),
            'Title': hit['Title'],
            'URL': hit['URL'],
            'Snippet': hit['Snippet'][:50] + "...",
        }
        for position, hit in enumerate(results, start=1)
    ]
    # Passing the column list keeps the header even when there are no hits.
    table = pd.DataFrame(table_rows, columns=column_order)
    print(f"\n===== Results for Query: \"{query}\" =====\n")
    print(table.to_string(index=False))
    print("\n")

# =========================
# Main Execution Example
# =========================
# Script entry point: crawl the site, build the TF-IDF data, export the Excel
# reports and print ranked results for a few sample queries. The names
# assigned here are module-level globals.
if __name__ == "__main__":
    start_time = time.time()

    # 1. Crawl & build index
    website_url = 'https://www.who.int'
    max_pages_to_crawl = 3000
    index, search_results, doc_id_to_url, doc_length = create_index(website_url, max_pages=max_pages_to_crawl)

    # 2. Compute IDF for the corpus
    # doc_id_to_url only holds pages that were fetched successfully, so failed
    # requests do not count towards the corpus size.
    total_docs = len(doc_id_to_url)
    idf_dict = compute_idf(index, total_docs)

    # 3. Export search results
    export_search_results(search_results, filename='search_results.xlsx')

    # 4. Create a word frequency ranking
    ranked_words = create_ranked_words(index)
    # Export the postings of the 15 most frequent words only
    chosen_words = list(ranked_words.keys())[:15]
    export_inverted_index(index, chosen_words, filename='inverted_index.xlsx')

    # Example queries
    # NOTE: search() keeps only purely alphabetic query tokens, so "COVID-19"
    # contributes nothing to the first query.
    queries = [
        'Types of COVID-19 Treatment',
        "guidelines on antibiotic resistance",
        "mental health resources during emergencies"
    ]

    for q in queries:
        # top_k can be adjusted to show however many results you want
        top_k = 20
        results_for_query = search(
            q,
            index,
            idf_dict,
            doc_length,
            doc_id_to_url,
            search_results,
            top_k=top_k
        )
        pretty_print_results(q, results_for_query)

    # [Optional] Export full term-doc matrix
    create_term_doc_excel_for_all_terms(index, doc_id_to_url, file_path="term_doc_appearance_all_terms.xlsx")

    end_time = time.time()
    print(f"Done! Elapsed time: {end_time - start_time:.2f} seconds.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
%����
1 0 obj
< >/Metadata 2293 0 R/ViewerPreferences 2294 0 R>>
endobj
2 0 obj
< >


Number of tokens after preprocessing: 3058
Indexed 3058 tokens for document ID 2444.

Extracting links from: https://www.who.int/docs/default-source/coronaviruse/situation-reports/20200303-sitrep-43-covid-19.pdf
Found 0 normalized links.

Crawling (2445/3000): https://www.who.int/docs/default-source/coronaviruse/situation-reports/20200302-sitrep-42-covid-19.pdf
Title Extracted: No Title
Snippet Extracted: %PDF-1.7
%����
1 0 obj
< >/Metadata 2149 0 R/ViewerPreferences 2150 0 R>>
endobj
2 0 obj
< >


Number of tokens after preprocessing: 2812
Indexed 2812 tokens for document ID 2445.

Extracting links from: https://www.who.int/docs/default-source/coronaviruse/situation-reports/20200302-sitrep-42-covid-19.pdf
Found 0 normalized links.

Crawling (2446/3000): https://www.who.int/docs/default-source/coronaviruse/situation-reports/20200301-sitr

Algorithm for checking which retrieved pages are linked

Query 2

In [None]:
import requests
from bs4 import BeautifulSoup

# URLs of the retrieved pages to cross-check (hard-coded for this query).
# The loop below reports which of these pages link directly to one another.
ret_urls = [
    "https://www.who.int/news-room/fact-sheets/detail/hiv-drug-resistance",
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/science-in-5/episode-11---antibiotics-covid-19",
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/science-in-5/microbes-are-becoming-resistant-to-antibiotics",
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/science-in-5/episode-61---covid-19-antibiotics",
    "https://www.who.int/westernpacific/newsroom/events/world-antibiotic-awareness-week",
    "https://www.who.int/publications/who-guidelines",
    "https://www.who.int/news-room/fact-sheets/detail/multi-drug-resistant-gonorrhoea",
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/science-in-5/episode-99-three-things-to-keep-in-mind-when-taking-antibiotics",
    "https://www.who.int/publications/i/item/guidelines-for-malaria",
    "https://www.who.int/campaigns/world-amr-awareness-week",
    "https://www.who.int/westernpacific/activities/tackling-antimicrobial-resistance",
    "https://www.who.int/guam/news/feature-stories/item/combating-antimicrobial-resistance-in-the-pacific",
    "https://www.who.int/americansamoa/news/feature-stories/item/combating-antimicrobial-resistance-in-the-pacific",
    "https://www.who.int/tonga/news/feature-stories/item/combating-antimicrobial-resistance-in-the-pacific",
    "https://www.who.int/vanuatu/news/feature-stories/item/combating-antimicrobial-resistance-in-the-pacific",
    "https://www.who.int/wallisandfutuna/news/feature-stories/item/combating-antimicrobial-resistance-in-the-pacific",
    "https://www.who.int/news-room/fact-sheets/detail/antibiotic-resistance",
    "https://www.who.int/news-room/fact-sheets/detail/antimicrobial-resistance",
    "https://www.who.int/kiribati/news/feature-stories/item/combating-antimicrobial-resistance-in-the-pacific",
    "https://www.who.int/tuvalu/news/feature-stories/item/combating-antimicrobial-resistance-in-the-pacific"
]

# Set copy of the list so each membership test is O(1) instead of a list scan.
ret_url_set = set(ret_urls)

for current_url in ret_urls:
    # Fetch the page. The timeout keeps one stalled server from hanging the
    # whole cell, and a failed request skips that page instead of aborting
    # the remaining ones.
    try:
        response = requests.get(current_url, timeout=10)
    except requests.RequestException as e:
        print(f"Error fetching {current_url}: {e}")
        continue

    # Parse the HTML and collect every <a> tag
    soup = BeautifulSoup(response.text, "html.parser")
    links = soup.find_all("a")

    # Report each anchor whose href is exactly one of the retrieved URLs.
    # Relative hrefs are not resolved, so only absolute links can match; an
    # anchor repeated on the page is printed once per occurrence.
    for link in links:
        href = link.get("href")
        if href in ret_url_set:
            print(current_url + " ---> " + href)







https://www.who.int/news-room/fact-sheets/detail/hiv-drug-resistance ---> https://www.who.int/news-room/fact-sheets/detail/antimicrobial-resistance
https://www.who.int/publications/who-guidelines ---> https://www.who.int/publications/i/item/guidelines-for-malaria
https://www.who.int/publications/who-guidelines ---> https://www.who.int/publications/i/item/guidelines-for-malaria
https://www.who.int/news-room/fact-sheets/detail/multi-drug-resistant-gonorrhoea ---> https://www.who.int/news-room/fact-sheets/detail/antimicrobial-resistance
https://www.who.int/campaigns/world-amr-awareness-week ---> https://www.who.int/news-room/fact-sheets/detail/antimicrobial-resistance


In [None]:
import requests
from bs4 import BeautifulSoup

# URLs of the retrieved pages to cross-check (hard-coded for this query).
# The loop below reports which of these pages link directly to one another.
ret_urls = [
    "https://www.who.int/news-room/fact-sheets/detail/hiv-drug-resistance",
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/science-in-5/episode-11---antibiotics-covid-19",
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/science-in-5/microbes-are-becoming-resistant-to-antibiotics",
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/science-in-5/episode-61---covid-19-antibiotics",
    "https://www.who.int/westernpacific/newsroom/events/world-antibiotic-awareness-week",
    "https://www.who.int/publications/who-guidelines",
    "https://www.who.int/news-room/fact-sheets/detail/multi-drug-resistant-gonorrhoea",
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/science-in-5/episode-99-three-things-to-keep-in-mind-when-taking-antibiotics",
    "https://www.who.int/publications/i/item/guidelines-for-malaria",
    "https://www.who.int/campaigns/world-amr-awareness-week",
    "https://www.who.int/westernpacific/activities/tackling-antimicrobial-resistance",
    "https://www.who.int/guam/news/feature-stories/item/combating-antimicrobial-resistance-in-the-pacific",
    "https://www.who.int/americansamoa/news/feature-stories/item/combating-antimicrobial-resistance-in-the-pacific",
    "https://www.who.int/tonga/news/feature-stories/item/combating-antimicrobial-resistance-in-the-pacific",
    "https://www.who.int/vanuatu/news/feature-stories/item/combating-antimicrobial-resistance-in-the-pacific",
    "https://www.who.int/wallisandfutuna/news/feature-stories/item/combating-antimicrobial-resistance-in-the-pacific",
    "https://www.who.int/news-room/fact-sheets/detail/antibiotic-resistance",
    "https://www.who.int/news-room/fact-sheets/detail/antimicrobial-resistance",
    "https://www.who.int/kiribati/news/feature-stories/item/combating-antimicrobial-resistance-in-the-pacific",
    "https://www.who.int/tuvalu/news/feature-stories/item/combating-antimicrobial-resistance-in-the-pacific"
]

# Set copy of the list so each membership test is O(1) instead of a list scan.
ret_url_set = set(ret_urls)

for current_url in ret_urls:
    # Fetch the page. The timeout keeps one stalled server from hanging the
    # whole cell, and a failed request skips that page instead of aborting
    # the remaining ones.
    try:
        response = requests.get(current_url, timeout=10)
    except requests.RequestException as e:
        print(f"Error fetching {current_url}: {e}")
        continue

    # Parse the HTML and collect every <a> tag
    soup = BeautifulSoup(response.text, "html.parser")
    links = soup.find_all("a")

    # Report each anchor whose href is exactly one of the retrieved URLs,
    # formatted as "source : [target]". Relative hrefs are not resolved, so
    # only absolute links can match; an anchor repeated on the page is
    # printed once per occurrence.
    for link in links:
        href = link.get("href")
        if href in ret_url_set:
            print(current_url + " : " + "[" + href + "]")







https://www.who.int/news-room/fact-sheets/detail/hiv-drug-resistance : [https://www.who.int/news-room/fact-sheets/detail/antimicrobial-resistance]
https://www.who.int/publications/who-guidelines : [https://www.who.int/publications/i/item/guidelines-for-malaria]
https://www.who.int/publications/who-guidelines : [https://www.who.int/publications/i/item/guidelines-for-malaria]
https://www.who.int/news-room/fact-sheets/detail/multi-drug-resistant-gonorrhoea : [https://www.who.int/news-room/fact-sheets/detail/antimicrobial-resistance]
https://www.who.int/campaigns/world-amr-awareness-week : [https://www.who.int/news-room/fact-sheets/detail/antimicrobial-resistance]


Query 1

In [None]:
import requests
from bs4 import BeautifulSoup

# URLs of the retrieved pages to cross-check (hard-coded for this query).
# The loop below reports which of these pages link directly to one another.
ret_urls = [
    "https://www.who.int/news-room/fact-sheets/detail/diabetes",
    "https://www.who.int/news-room/fact-sheets/detail/lung-cancer",
    "https://www.who.int/news-room/fact-sheets/detail/schistosomiasis",
    "https://www.who.int/news-room/fact-sheets/detail/hepatitis-c",
    "https://www.who.int/news-room/fact-sheets/detail/depression",
    "https://www.who.int/news-room/fact-sheets/detail/human-papilloma-virus-and-cancer",
    "https://www.who.int/news-room/fact-sheets/detail/cancer",
    "https://www.who.int/westernpacific/activities/improving-access-to-prevention-testing-and-treatment-for-hepatitis",
    "https://www.who.int/news-room/fact-sheets/detail/colorectal-cancer",
    "https://www.who.int/publications/i/item/9789240096745",
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/science-in-5/episode-46---diabetes-covid-19",
    "https://www.who.int/news-room/fact-sheets/detail/endometriosis",
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/science-in-5/episode-68-covid-19-update-on-long-covid",
    "https://www.who.int/news-room/fact-sheets/detail/mycetoma",
    "https://www.who.int/news-room/fact-sheets/detail/tuberculosis",
    "https://www.who.int/news-room/fact-sheets/detail/leprosy",
    "https://www.who.int/news/item/08-01-2025-who-prequalifies-diagnostic-test-to-support-safer-administration-of-p.-vivax-malaria-treatments",
    "https://www.who.int/news-room/fact-sheets/detail/ebola-virus-disease",
    "https://www.who.int/news/item/23-04-2024-promising-patient-friendly-oral-drug-against-visceral-leishmaniasis-enters-phase-ii-clinical-trial-in-ethiopia",
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/science-in-5/episode-37---treatment-and-care-at-home"
]

# Set copy of the list so each membership test is O(1) instead of a list scan.
ret_url_set = set(ret_urls)

for current_url in ret_urls:
    # Fetch the page. The timeout keeps one stalled server from hanging the
    # whole cell, and a failed request skips that page instead of aborting
    # the remaining ones.
    try:
        response = requests.get(current_url, timeout=10)
    except requests.RequestException as e:
        print(f"Error fetching {current_url}: {e}")
        continue

    # Parse the HTML and collect every <a> tag
    soup = BeautifulSoup(response.text, "html.parser")
    links = soup.find_all("a")

    # Report each anchor whose href is exactly one of the retrieved URLs.
    # Relative hrefs are not resolved, so only absolute links can match; an
    # anchor repeated on the page is printed once per occurrence.
    for link in links:
        href = link.get("href")
        if href in ret_url_set:
            print(current_url + " ---> " + href)

https://www.who.int/news-room/fact-sheets/detail/lung-cancer ---> https://www.who.int/news-room/fact-sheets/detail/cancer
https://www.who.int/news-room/fact-sheets/detail/cancer ---> https://www.who.int/news-room/fact-sheets/detail/colorectal-cancer
https://www.who.int/news-room/fact-sheets/detail/cancer ---> https://www.who.int/news-room/fact-sheets/detail/lung-cancer
https://www.who.int/news-room/fact-sheets/detail/colorectal-cancer ---> https://www.who.int/news-room/fact-sheets/detail/cancer
https://www.who.int/news-room/fact-sheets/detail/colorectal-cancer ---> https://www.who.int/news-room/fact-sheets/detail/lung-cancer
https://www.who.int/news-room/fact-sheets/detail/mycetoma ---> https://www.who.int/news-room/fact-sheets/detail/leprosy
https://www.who.int/news-room/fact-sheets/detail/leprosy ---> https://www.who.int/news-room/fact-sheets/detail/mycetoma


**Query 2**

In [None]:
import requests
from bs4 import BeautifulSoup

# URLs of the retrieved pages to cross-check (hard-coded for this query).
# The loop below reports which of these pages link directly to one another.
ret_urls = [
    "https://www.who.int/teams/health-workforce/PHEworkforce",
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/science-in-5/hepatitis-outbreak-in-children",
    "https://www.who.int/publications/b",
    "https://www.who.int/malaysia/emergencies/covid-19-in-malaysia/information",
    "https://www.who.int/publications/i/item/9789240090743",
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/science-in-5/episode-71---covid-19-vaccines-and-children",
    "https://www.who.int/emergencies/operations",
    "https://www.who.int/activities/measuring-the-effectiveness-and-impact-of-public-health-and-social-measures",
    "https://www.who.int/westernpacific/activities/communicating-risk-in-public-health-emergencies",
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/science-in-5/public-health-emergency-of-international-concern",
    "https://www.who.int/mongolia/multi-media/item/live-life-safely--trains-and-public-transport",
    "https://www.who.int/westernpacific/our-work/resources/publications",
    "https://www.who.int/westernpacific/publications",
    "https://www.who.int/westernpacific/activities/detecting-and-assessing-emergency-health-threats",
    "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/media-resources/science-in-5/episode-43---pregnancy-covid-19",
    "https://www.who.int/publications/i/9789240081277",
    "https://www.who.int/about/policies/publishing/open-access",
    "https://www.who.int/westernpacific/publications/i",
    "https://www.who.int/americansamoa/publications",
    "https://www.who.int/brunei/publications-hub"
]

# Set copy of the list so each membership test is O(1) instead of a list scan.
ret_url_set = set(ret_urls)

for current_url in ret_urls:
    # Fetch the page. The timeout keeps one stalled server from hanging the
    # whole cell, and a failed request skips that page instead of aborting
    # the remaining ones.
    try:
        response = requests.get(current_url, timeout=10)
    except requests.RequestException as e:
        print(f"Error fetching {current_url}: {e}")
        continue

    # Parse the HTML and collect every <a> tag
    soup = BeautifulSoup(response.text, "html.parser")
    links = soup.find_all("a")

    # Report each anchor whose href is exactly one of the retrieved URLs.
    # Relative hrefs are not resolved, so only absolute links can match; an
    # anchor repeated on the page is printed once per occurrence, and a page
    # that links to itself is reported as well.
    for link in links:
        href = link.get("href")
        if href in ret_url_set:
            print(current_url + " ---> " + href)

https://www.who.int/westernpacific/activities/communicating-risk-in-public-health-emergencies ---> https://www.who.int/westernpacific/activities/communicating-risk-in-public-health-emergencies


Query 3

In [None]:
import requests
from bs4 import BeautifulSoup

# URLs of the retrieved pages to cross-check (hard-coded for this query).
# The loop below reports which of these pages link directly to one another.
ret_urls = [
    "https://www.who.int/campaigns/world-mental-health-day",
    "https://www.who.int/news-room/fact-sheets/detail/mental-health-strengthening-our-response",
    "https://www.who.int/news-room/fact-sheets/detail/mental-health-at-work",
    "https://www.who.int/news-room/fact-sheets/detail/mental-health-in-emergencies",
    "https://www.who.int/westernpacific/activities/promoting-mental-health",
    "https://www.who.int/china/news/feature-stories/detail/covid-19-sounding-the-alarm-again-that-mental-health-intervention-is-an-indispensable-health-service",
    "https://www.who.int/pitcairnislands/emergencies",
    "https://www.who.int/tokelau/emergencies",
    "https://www.who.int/westernpacific/activities/protecting-mental-health-during-emergencies",
    "https://www.who.int/emergencies/operations",
    "https://www.who.int/westernpacific/activities/strengthening-government-action-on-mental-health",
    "https://www.who.int/westernpacific/activities/developing-community-based-mental-health-services",
    "https://www.who.int/fiji/publications-detail/RS-2003-GE-44-TON",
    "https://www.who.int/teams/health-workforce/PHEworkforce",
    "https://www.who.int/westernpacific/about/how-we-work/programmes/who-health-emergencies-programme",
    "https://www.who.int/news-room/fact-sheets/detail/occupational-health--health-workers",
    "https://www.who.int/news-room/fact-sheets/detail/mental-health-and-forced-displacement",
    "https://www.who.int/news-room/fact-sheets/detail/health-literacy",
    "https://www.who.int/malaysia/news/detail/12-09-2024-singapore-contributes-to-regional-health-emergency-readiness-through-achieving-emergency-medical-team-classification",
    "https://www.who.int/singapore/news/detail/12-09-2024-singapore-contributes-to-regional-health-emergency-readiness-through-achieving-emergency-medical-team-classification"
]

# Set copy of the list so each membership test is O(1) instead of a list scan.
ret_url_set = set(ret_urls)

for current_url in ret_urls:
    # Fetch the page. The timeout keeps one stalled server from hanging the
    # whole cell, and a failed request skips that page instead of aborting
    # the remaining ones.
    try:
        response = requests.get(current_url, timeout=10)
    except requests.RequestException as e:
        print(f"Error fetching {current_url}: {e}")
        continue

    # Parse the HTML and collect every <a> tag
    soup = BeautifulSoup(response.text, "html.parser")
    links = soup.find_all("a")

    # Report each anchor whose href is exactly one of the retrieved URLs,
    # formatted as "source : [target]". Relative hrefs are not resolved, so
    # only absolute links can match; an anchor repeated on the page is
    # printed once per occurrence.
    for link in links:
        href = link.get("href")
        if href in ret_url_set:
            print(current_url + " : " + "[" + href + "]")

https://www.who.int/campaigns/world-mental-health-day : [https://www.who.int/news-room/fact-sheets/detail/mental-health-strengthening-our-response]
https://www.who.int/campaigns/world-mental-health-day : [https://www.who.int/news-room/fact-sheets/detail/mental-health-in-emergencies]
https://www.who.int/news-room/fact-sheets/detail/mental-health-strengthening-our-response : [https://www.who.int/news-room/fact-sheets/detail/mental-health-and-forced-displacement]
https://www.who.int/news-room/fact-sheets/detail/mental-health-strengthening-our-response : [https://www.who.int/news-room/fact-sheets/detail/mental-health-in-emergencies]
https://www.who.int/news-room/fact-sheets/detail/mental-health-at-work : [https://www.who.int/news-room/fact-sheets/detail/mental-health-strengthening-our-response]
https://www.who.int/news-room/fact-sheets/detail/mental-health-at-work : [https://www.who.int/news-room/fact-sheets/detail/occupational-health--health-workers]
https://www.who.int/news-room/fact-she

In [None]:
def calculate_new_pagerank(current_ranks, links):
    """Run one iteration of simplified PageRank (no damping factor).

    Each page's new rank is the sum, over the pages that link to it, of the
    linking page's current rank divided by its number of distinct outgoing
    links. Rank held by a page with no outgoing links is not redistributed.

    Args:
        current_ranks: dict mapping each page to its current rank.
        links: dict mapping a source page to the pages it links to.

    Returns:
        A new dict with the same keys, in the same order, as current_ranks.
        Pages with no incoming links get 0.

    Raises:
        KeyError: if a page that links to a ranked page is itself missing
            from current_ranks.
    """
    new_ranks = dict.fromkeys(current_ranks, 0)

    # Push each source page's rank to its targets in a single pass over the
    # link table, instead of rescanning every link list once per page.
    for source_page, outgoing in links.items():
        # A link repeated in the list is still one link: deduplicate so the
        # divisor matches the number of targets that actually receive a
        # share (otherwise part of the source's rank silently disappears).
        targets = set(outgoing)
        ranked_targets = [page for page in targets if page in new_ranks]
        if not ranked_targets:
            # Nothing to contribute to; also avoids dividing by zero for a
            # source with an empty link list.
            continue

        share = current_ranks[source_page] / len(targets)
        for page in ranked_targets:
            new_ranks[page] += share

    return new_ranks

def print_ranks(ranks, iteration):
    """Print the PageRank values of one iteration as a small text table.

    Args:
        ranks: dict mapping a page label to its numeric rank.
        iteration: iteration number shown in the table heading.
    """
    divider = "-" * 35

    # Heading, then the column titles framed by divider lines
    header_lines = (
        f"\nPageRank values after iteration {iteration}:",
        divider,
        "Page  |  PageRank Value",
        divider,
    )
    for header_line in header_lines:
        print(header_line)

    # One row per page, rank shown to three decimal places
    for label, value in ranks.items():
        print(f"  {label}   |     {value:.3f}")

    print(divider)

# Define the web structure.
# Each single-letter label stands for one page. The URL constants record
# which page a label refers to; the computation below uses only the labels.
A = 'https://www.who.int/news-room/fact-sheets/detail/hiv-drug-resistance'
B = 'https://www.who.int/news-room/fact-sheets/detail/antimicrobial-resistance'
C = 'https://www.who.int/publications/who-guidelines'
D = 'https://www.who.int/publications/i/item/guidelines-for-malaria'
E = 'https://www.who.int/news-room/fact-sheets/detail/multi-drug-resistant-gonorrhoea'
# NOTE(review): F is the same URL as B, so one page is modelled as two
# separate nodes — confirm whether E should link to 'B' instead.
F = 'https://www.who.int/news-room/fact-sheets/detail/antimicrobial-resistance'
G = 'https://www.who.int/campaigns/world-mental-health-day'
H = 'https://www.who.int/news-room/fact-sheets/detail/mental-health-strengthening-our-response'
I = 'https://www.who.int/news-room/fact-sheets/detail/mental-health-in-emergencies'
J = 'https://www.who.int/westernpacific/activities/promoting-mental-health'

# Outgoing links per page; pages that link to nothing are simply absent.
links = {
    'A': ['B'],
    'C': ['D'],
    'E': ['F'],
    'G': ['H', 'I'],
    'J': ['G'],
}

# Initialize PageRank values uniformly: 1/N for each of the N pages, so the
# starting value stays correct if pages are added or removed.
pages = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
current_ranks = {page: 1 / len(pages) for page in pages}

print("Web structure:")
for page, outlinks in links.items():
    print(f"Page {page} links to: {', '.join(outlinks)}")

# Print initial values
print("\nInitial PageRank Values:")
print_ranks(current_ranks, 0)

# First iteration
first_iteration = calculate_new_pagerank(current_ranks, links)
print_ranks(first_iteration, 1)

# Second iteration (fed with the output of the first)
second_iteration = calculate_new_pagerank(first_iteration, links)
print_ranks(second_iteration, 2)

# Find highest PageRank after second iteration; on a tie, max() keeps the
# first page in dict order.
highest_page = max(second_iteration.items(), key=lambda x: x[1])
print("\nHighest PageRank after second iteration:")
print(f"Page {highest_page[0]} with PageRank value of {highest_page[1]:.3f}")

Web structure:
Page A links to: B
Page C links to: D
Page E links to: F
Page G links to: H, I
Page J links to: G

Initial PageRank Values:

PageRank values after iteration 0:
-----------------------------------
Page  |  PageRank Value
-----------------------------------
  A   |     0.100
  B   |     0.100
  C   |     0.100
  D   |     0.100
  E   |     0.100
  F   |     0.100
  G   |     0.100
  H   |     0.100
  I   |     0.100
  J   |     0.100
-----------------------------------

PageRank values after iteration 1:
-----------------------------------
Page  |  PageRank Value
-----------------------------------
  A   |     0.000
  B   |     0.100
  C   |     0.000
  D   |     0.100
  E   |     0.000
  F   |     0.100
  G   |     0.100
  H   |     0.050
  I   |     0.050
  J   |     0.000
-----------------------------------

PageRank values after iteration 2:
-----------------------------------
Page  |  PageRank Value
-----------------------------------
  A   |     0.000
  B   |     0.0