In [None]:
import requests
from bs4 import BeautifulSoup

Stopwords are used when building the inverted index. The inverted index will ignore stopwords.

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

STOPWORDS = stopwords.words('english')
print(STOPWORDS)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Add custom stopwords if you deem it necessary

In [None]:
custom_STOPWORDS = [] # Add your own stopwords here
STOPWORDS.extend(custom_STOPWORDS)

In [None]:
from collections import defaultdict

# Inverted index: word -> set of URLs
inverted_index = defaultdict(set)
url_list = set()

In [None]:
# This dictionary will be used to build the connection between links
web_connection = {'source':[], 'target':[]}

In [None]:
import re

# This function will clean the content of web page in order to build the inverted index.
def clean_and_tokenize(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())  # Remove punctuation and lowercase
    tokens = text.split()
    return [t for t in tokens if t not in STOPWORDS and len(t) > 1]

In [None]:
from urllib.parse import urljoin, urlparse

# The crawl function has 5 parameters
# url = The url to crawl
# base_domain = the base domain of the url. During crawling, the crawler will ignore links from other domains

def crawl(url, base_domain, visited, visit_limit, limit):
    if limit==0 or len(visited)==visit_limit:
        return

    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return
    except requests.RequestException:
        return

    visited.add(url)
    print("-"*(10-limit), end=" ")
    print(f"Crawled: {url}")

    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.get_text(separator=' ', strip=True)
    words = clean_and_tokenize(text)

    for word in words:
        inverted_index[word].add(url)
        url_list.add(url)

    # Recursively follow links
    for tag in soup.find_all('a', href=True):
        link = urljoin(url, tag['href'])
        parsed = urlparse(link)

        # Store external links as connection
        web_connection['source'].append(url)
        web_connection['target'].append(link)

        if parsed.netloc == base_domain and link not in visited:
            crawl(link, base_domain, visited, visit_limit, limit-1)

In [None]:
def crawl_roots(root_urls, max_per_root=2, visit_limit=75):
    for root in root_urls:
        print(f"\nStarting crawl from: {root}")
        domain = urlparse(root).netloc
        visited = set()
        crawl(root, domain, visited, visit_limit, max_per_root)

In [None]:
seed_urls =  [
    'https://www.espn.com/soccer/',
    'https://www.goal.com',
    'https://www.skysports.com/football',
    'https://www.bbc.com/sport/football',
    'https://www.football365.com',
    'https://www.fourfourtwo.com',
    'https://www.theguardian.com/football',
    'https://www.cbssports.com/soccer/',
    'https://www.90min.com',
    'https://www.squawka.com',
    'https://www.fifa.com/news',
    'https://www.uefa.com/news/',
    'https://www.premierleague.com/news',
    'https://www.laliga.com/en-GB/news',
    'https://www.bundesliga.com/en/news',
    'https://www.seriea.com/en/news',
    'https://www.mlssoccer.com/news/',
    'https://www.afc.com/news',
    'https://onefootball.com/en/news',
    'https://www.football-italia.net'
]

crawl_roots(seed_urls, max_per_root=10)


Starting crawl from: https://www.espn.com/soccer/

Starting crawl from: https://www.goal.com
 Crawled: https://www.goal.com
- Crawled: https://www.goal.com/en-us
-- Crawled: https://www.goal.com/en-us/live-scores
--- Crawled: https://www.goal.com/en-us/news
---- Crawled: https://www.goal.com/en-us/category/transfers/1/k94w8e1yy9ch14mllpf4srnks
----- Crawled: https://www.goal.com/en-us/category/opinion/1/bltda2eefda7fac61db
------ Crawled: https://www.goal.com/en-us/category/analysis/1/blt0e4843c7e245b533
------- Crawled: https://www.goal.com/en-us/category/power-rankings/1/blt262ce0e5159ea8fe
-------- Crawled: https://www.goal.com/en-us/category/player-ratings/1/blt9e3963966f918671
--------- Crawled: https://www.goal.com/en-us/category/winners-and-losers/1/blt05e54ed95ba7b0f8
--------- Crawled: https://www.goal.com/goalchampions
--------- Crawled: https://www.goal.com/goaleditions/2/index.html
--------- Crawled: https://www.goal.com/en-us/category/culture/1/edzsmxzmp4y81pp4ozi9lrdeh
-

In [None]:
# print inverted index
print("\nSample inverted index (first 20 words):")
for word in list(inverted_index.keys())[:20]:
    print(f"{word}: {list(inverted_index[word])}")


Sample inverted index (first 20 words):
soccer: ['https://www.theguardian.com/science', 'https://www.theguardian.com/podcasts', 'https://www.goal.com/en-us/team/barcelona/agh9ifb2mw3ivjusgedj7c3fe', 'https://www.cbssports.com/nfl/schedule/', 'https://www.mlssoccer.com/news/power-rankings-philadelphia-union-challenge-vancouver-whitecaps-top-spot', 'https://www.cbssports.com/nfl/teams/LAR/los-angeles-rams/', 'https://www.theguardian.com/crosswords', 'https://www.cbssports.com/soccer/', 'https://www.fourfourtwo.com/news/liverpool-report-historic-wage-breaking-deal-coming-for-alexander-isak-with-reds-looking-at-other-attackers', 'https://www.mlssoccer.com/schedule/scores', 'https://www.mlssoccer.com/stats/index', 'https://www.goal.com/en-us/concacaf-gold-cup/f51991ex45qhp1p3iu74u4d4e', 'https://www.cbssports.com/betting/news/nfl/', 'https://www.cbssports.com/nfl/teams/TEN/tennessee-titans/', 'https://www.theguardian.com/tone/recipes', 'https://www.goal.com/en-us/player/w-mckennie/288nn7pb

In [None]:
# Print first 20 connections

for source, target in list(zip(web_connection['source'], web_connection['target']))[:20]:
    print(f"{source} -> {target}")

https://www.goal.com -> https://www.goal.com/en-us
https://www.goal.com/en-us -> https://www.goal.com/en-us
https://www.goal.com/en-us -> https://www.goal.com/en-us/live-scores
https://www.goal.com/en-us/live-scores -> https://www.goal.com/en-us
https://www.goal.com/en-us/live-scores -> https://www.goal.com/en-us/live-scores
https://www.goal.com/en-us/live-scores -> https://www.goal.com/en-us/news
https://www.goal.com/en-us/news -> https://www.goal.com/en-us
https://www.goal.com/en-us/news -> https://www.goal.com/en-us/live-scores
https://www.goal.com/en-us/news -> https://www.goal.com/en-us/news
https://www.goal.com/en-us/news -> https://www.goal.com/en-us/category/transfers/1/k94w8e1yy9ch14mllpf4srnks
https://www.goal.com/en-us/category/transfers/1/k94w8e1yy9ch14mllpf4srnks -> https://www.goal.com/en-us
https://www.goal.com/en-us/category/transfers/1/k94w8e1yy9ch14mllpf4srnks -> https://www.goal.com/en-us/live-scores
https://www.goal.com/en-us/category/transfers/1/k94w8e1yy9ch14mllpf

In [None]:
import networkx as nx

web_graph = nx.DiGraph()
for source, target in zip(web_connection['source'], web_connection['target']):
    web_graph.add_edge(source, target) # Add edges for individual source-target pairs

In [None]:
len(web_graph.nodes)

20971

In [None]:
pagerank_scores = nx.pagerank(web_graph, alpha=0.85, max_iter=100, tol=1e-6)
print("\nPageRank Scores:", pagerank_scores)




In [None]:
def search_engine(query, index, scores):
    query_terms = query.lower().split()
    results = set()
    for term in query_terms:
        if term in index:
            if not results:
                results = set(index[term])
            else:
                results = results.intersection(index[term])  # Find common websites

    # Sort results based on score
    ranked_results = []
    for website in results:
        if website in scores:
          ranked_results.append((website, scores[website]))
    ranked_results.sort(key=lambda x: x[1], reverse=True)

    return ranked_results

In [139]:
# Query and display results
query = "Messi"
print(f"\nSearch Results for '{query}' using PageRank:")
results = search_engine(query, inverted_index, pagerank_scores) # Changed 'index' to 'inverted_index'

for page, score in results:
    print(f"{page}: {score}")  # Removed web_content[page] as web_content is not defined


print(f"\nSearch Results for '{query}' using HITS (Authorities):")
# Calculate HITS scores if needed
# authorities = nx.hits(web_graph)[1]  # Uncomment if you have HITS scores calculated
# results = search_engine(query, inverted_index, authorities)  # Uncomment if you have HITS scores calculated

# Placeholder for HITS results
# for page, score in results:
#    print(f"{page}: {score}") # Removed web_content[page] as web_content is not defined


Search Results for 'Messi' using PageRank:
https://www.90min.com/: 0.00011090849690231902
https://www.90min.com/fr: 0.00010882151229415566
https://www.90min.com/es: 0.00010882151229415566
https://www.90min.com/es/easports-fc-24: 0.00010485179222849334
https://www.90min.com/es/easports-fc-25: 0.00010338405560698886
https://www.90min.com/es/life-style: 0.00010017963243545609
https://www.fourfourtwo.com/features/fourfourtwo-worldwide: 9.700059782903743e-05
https://www.fourfourtwo.com/features/about-fourfourtwo: 9.700059782903743e-05
https://www.fourfourtwo.com/features/newsletter: 9.700059782903743e-05
https://www.fourfourtwo.com/features/fourfourtwo-magazine-pitching-guide: 9.700059782903743e-05
https://www.fourfourtwo.com/features/about-fourfourtwo#section-affiliate-advertising-disclosure: 9.520668730083708e-05
https://www.goal.com/en-us: 8.389950865912757e-05
https://www.skysports.com: 8.285370892184643e-05
https://www.goal.com/goalchampions: 8.195504896102651e-05
https://www.goal.com