<a href="https://colab.research.google.com/github/ahmed010tanvir/Data-Mining-and-warehouse-Lab/blob/master/Project_3_Building_a_Domain_Specific_Search_Engine_with_Crawling_and_Link_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import networkx as nx
import numpy as np
import pandas as pd

Stopwords are used when building the inverted index. The inverted index will ignore stopwords.

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

STOPWORDS = stopwords.words('english')
print(STOPWORDS)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Add custom stopwords if you deem it necessary

In [None]:
custom_STOPWORDS = [] # Add your own stopwords here
STOPWORDS.extend(custom_STOPWORDS)

In [None]:
from collections import defaultdict

# Inverted index: word -> set of URLs
inverted_index = defaultdict(set)
url_list = set()

In [None]:
# This dictionary will be used to build the connection between links
web_connection = {'source':[], 'target':[]}

In [None]:
import re

# This function will clean the content of web page in order to build the inverted index.
def clean_and_tokenize(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())  # Remove punctuation and lowercase
    tokens = text.split()
    return [t for t in tokens if t not in STOPWORDS and len(t) > 1]

In [None]:
from urllib.parse import urljoin, urlparse

# The crawl function has 5 parameters
# url = The url to crawl
# base_domain = the base domain of the url. During crawling, the crawler will ignore links from other domains

def crawl(url, base_domain, visited, visit_limit, limit):
    if limit==0 or len(visited)==visit_limit:
        return

    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return
    except requests.RequestException:
        return

    visited.add(url)
    print("-"*(10-limit), end=" ")
    print(f"Crawled: {url}")

    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.get_text(separator=' ', strip=True)
    words = clean_and_tokenize(text)

    for word in words:
        inverted_index[word].add(url)
        url_list.add(url)

    # Recursively follow links
    for tag in soup.find_all('a', href=True):
        link = urljoin(url, tag['href'])
        parsed = urlparse(link)

        # Store external links as connection
        web_connection['source'].append(url)
        web_connection['target'].append(link)

        if parsed.netloc == base_domain and link not in visited:
            crawl(link, base_domain, visited, visit_limit, limit-1)

In [None]:
def crawl_roots(root_urls, max_per_root=2, visit_limit=50):
    for root in root_urls:
        print(f"\nStarting crawl from: {root}")
        domain = urlparse(root).netloc
        visited = set()
        crawl(root, domain, visited, visit_limit, max_per_root)

In [None]:
seed_urls = [
    'https://cs.ut.ee/en/content/computer-science-msc',  # University of Tartu, Estonia
    'https://sigarra.up.pt/fcup/en/CUR_GERAL.CUR_VIEW?pv_ano_lectivo=2025&pv_curso_id=876',  # University of Porto, Portugal
    'https://informatorects.uw.edu.pl/en/programmes-all/IN/',  # University of Warsaw, Poland
    'https://www.mff.cuni.cz/en/students/master-of-computer-science',  # Charles University, Czech Republic
    'https://studieren.univie.ac.at/en/degree-programmes/master-programmes/computer-science-master/',  # University of Vienna, Austria
    'https://www.ub.edu/web/portal/en/estudis/oferta_formativa/master_university_programmes/masters/computer_science.html',  # University of Barcelona, Spain
    'https://apply.unipd.it/courses/course/63-computer-science',  # University of Padua
    'https://www.fri.uni-lj.si/en/',  # University of Ljubljana, Slovenia
    'https://en.gs.sjtu.edu.cn/Admission1/Degree_Programs/Master.htm',  # Shanghai Jiao Tong University, China
    'https://iczu.zju.edu.cn/admissionsen/wasterwswwegreewwrograms/list.htm',  # Zhejiang University, China
]


crawl_roots(seed_urls, max_per_root=10)


Starting crawl from: https://cs.ut.ee/en/content/computer-science-msc
 Crawled: https://cs.ut.ee/en/content/computer-science-msc
- Crawled: https://cs.ut.ee/en/content/computer-science-msc#main-content
-- Crawled: https://cs.ut.ee/et
--- Crawled: https://cs.ut.ee/et#main-content
---- Crawled: https://cs.ut.ee/en
----- Crawled: https://cs.ut.ee/en#main-content
------ Crawled: https://cs.ut.ee/en/deans-office-faculty-arts-and-humanities
------- Crawled: https://cs.ut.ee/en/deans-office-faculty-arts-and-humanities#main-content
-------- Crawled: https://cs.ut.ee/et/humanitaarteaduste-ja-kunstide-valdkonna-dekanaat
--------- Crawled: https://cs.ut.ee/et/humanitaarteaduste-ja-kunstide-valdkonna-dekanaat#main-content
--------- Crawled: https://cs.ut.ee/cdn-cgi/l/email-protection#b4dcc29ad0d1dfd5dad5d5c0f4c1c09ad1d1
--------- Crawled: https://cs.ut.ee/et/ajaloo-ja-arheoloogia-instituut
--------- Crawled: https://cs.ut.ee/cdn-cgi/l/email-protection#9cfdf6fdf0e9fbe9dce9e8b2f9f9
--------- Crawle

In [None]:
# print inverted index
print("\nSample inverted index (first 20 words):")
for word in list(inverted_index.keys())[:20]:
    print(f"{word}: {list(inverted_index[word])}")


Sample inverted index (first 20 words):
computer: ['https://informatorects.uw.edu.pl/pl/courses/', 'https://sigarra.up.pt/fcup/en/WEB_BASE.GERA_PAGINA?P_pagina=1182', 'https://www.mff.cuni.cz/en/students/master-of-computer-science', 'https://apply.unipd.it/courses/course/63-computer-science', 'https://www.fri.uni-lj.si/sl/studijski-program/multimedija-0', 'https://sigarra.up.pt/fcup/en/CUR_GERAL.CUR_VIEW?pv_ano_lectivo=2025&pv_curso_id=876', 'https://cs.ut.ee/en/deans-office-faculty-arts-and-humanities', 'https://apply.unipd.it/courses/course/90-computer-engineering', 'https://iczu.zju.edu.cn/admissionsen/main.htm', 'https://apply.unipd.it/courses/course/226-ingegneria-dellautomazione-e-dei-sistemi', 'https://apply.unipd.it/courses/course/121-ingegneria-biomedica', 'https://studieren.univie.ac.at/en/entrance-exam/test-details/', 'https://cs.ut.ee/en/content/computer-science-msc', 'http://iczu.zju.edu.cn/admissions', 'https://studieren.univie.ac.at/en/degree-programmes/master-programme

In [None]:
# Print first 20 connections

for source, target in list(zip(web_connection['source'], web_connection['target']))[:20]:
    print(f"{source} -> {target}")

https://cs.ut.ee/en/content/computer-science-msc -> https://cs.ut.ee/en/content/computer-science-msc#main-content
https://cs.ut.ee/en/content/computer-science-msc#main-content -> https://cs.ut.ee/en/content/computer-science-msc#main-content
https://cs.ut.ee/en/content/computer-science-msc#main-content -> https://ut.ee/en
https://cs.ut.ee/en/content/computer-science-msc#main-content -> https://ut.ee/en
https://cs.ut.ee/en/content/computer-science-msc#main-content -> https://cs.ut.ee/en/content/computer-science-msc
https://cs.ut.ee/en/content/computer-science-msc#main-content -> https://siseveeb.ut.ee/en
https://cs.ut.ee/en/content/computer-science-msc#main-content -> https://ois.ut.ee/
https://cs.ut.ee/en/content/computer-science-msc#main-content -> https://ois2.ut.ee/#/dashboard
https://cs.ut.ee/en/content/computer-science-msc#main-content -> https://moodle.ut.ee/?lang=en
https://cs.ut.ee/en/content/computer-science-msc#main-content -> https://utlib.ut.ee/en
https://cs.ut.ee/en/content

In [None]:
web_graph = nx.DiGraph()
for source, target in zip(web_connection['source'], web_connection['target']):
  web_graph.add_edge(source, target) # Iterate through source and target lists using zip

In [None]:
pagerank_scores = nx.pagerank(web_graph, alpha=0.85, max_iter=100, tol=1e-6)
print("\nPageRank Scores:", pagerank_scores)


PageRank Scores: {'https://cs.ut.ee/en/content/computer-science-msc': 9.121899078569716e-05, 'https://cs.ut.ee/en/content/computer-science-msc#main-content': 8.44849283082666e-05, 'https://ut.ee/en': 8.552936517412828e-05, 'https://siseveeb.ut.ee/en': 8.552936517412828e-05, 'https://ois.ut.ee/': 9.121899078569716e-05, 'https://ois2.ut.ee/#/dashboard': 9.121899078569716e-05, 'https://moodle.ut.ee/?lang=en': 8.552936517412828e-05, 'https://utlib.ut.ee/en': 9.121899078569716e-05, 'https://stiiliraamat.ut.ee/?lang=en': 9.121899078569716e-05, 'https://ut.ee/en/content/accessibility': 8.552936517412828e-05, 'https://cs.ut.ee/et': 9.070148779987037e-05, 'https://cs.ut.ee/et#main-content': 8.446310417831967e-05, 'https://ut.ee/et': 8.963928802134636e-05, 'https://siseveeb.ut.ee/': 8.963928802134636e-05, 'https://moodle.ut.ee/': 8.963928802134636e-05, 'https://www.ajakiri.ut.ee': 8.963928802134636e-05, 'https://ut.ee/et/sisu/juurdepaasetavus': 8.963928802134636e-05, 'https://cs.ut.ee/en': 8.60

In [None]:
def search_engine(query, index, scores):
    query_terms = query.lower().split()
    results = set()
    for term in query_terms:
        if term in index:
            if not results:
                results = set(index[term])
            else:
                results = results.intersection(index[term])  # Find common websites

    # Sort results based on score
    ranked_results = []
    for website in results:
        if website in scores:
          ranked_results.append((website, scores[website]))
    ranked_results.sort(key=lambda x: x[1], reverse=True)

    return ranked_results

In [None]:
# Query and display results
query = "Master of Computer Science"
print(f"\nSearch Results for '{query}' using PageRank:")
results = search_engine(query, inverted_index, pagerank_scores)

for page, score in results:
    print(f"{page}: ({score})")


Search Results for 'Master of Computer Science' using PageRank:
https://iczu.zju.edu.cn/admissionsen/main.htm: (0.0002165340303912067)
https://iczu.zju.edu.cn/admissionsen/wwwwatwawwlance/list.htm: (0.0002165340303912067)
https://sigarra.up.pt/fcup/en/cur_geral.cur_inicio: (0.00015513276401225162)
https://studieren.univie.ac.at/en/entrance-exam/test-details/: (0.0001322620274591458)
https://studieren.univie.ac.at/en/entrance-exam/exemptions-from-entrance-exam-procedure/: (0.0001322620274591458)
https://studieren.univie.ac.at/en/entrance-exam/faq/: (0.0001322620274591458)
https://studieren.univie.ac.at/en/entrance-exam/: (0.0001322620274591458)
https://studieren.univie.ac.at/en/degree-programmes/teacher-education-programme/: (0.0001322620274591458)
https://studieren.univie.ac.at/en/degree-programmes/bachelordiploma-programmes/: (0.0001322620274591458)
https://studieren.univie.ac.at/en/degree-programmes/master-programmes/: (0.0001322620274591458)
https://studieren.univie.ac.at/en/admiss