In [5]:
# !pip install requests beautifulsoup4

In [6]:
import requests
from bs4 import BeautifulSoup
import re

# Ergebnisse
results = {
    "visited_sites": [],
    "violations": {},
    "errors": []
}


In [9]:
def fetch_html(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        results["errors"].append((url, str(e)))  # save errors
        return None

# Search function
def extract_links(html, base_url):
    soup = BeautifulSoup(html, 'html.parser')
    links = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        if href.startswith('http'):
            links.add(href)
        elif href.startswith('/'):  # Relative-link
            links.add(base_url + href)
    return links

# function for analyzing copyright violations
def check_copyright_violations(content):
    patterns = [
        r"©\s?\d{4}",            # year with copyright symbol
        r"All rights reserved",  # copyrights text
    ]
    for pattern in patterns:
        if re.search(pattern, content, re.IGNORECASE):
            return True
    return False

# main_function of the crawler
def web_crawler(start_url, max_depth=2):
    visited_urls = set()
    to_visit = [(start_url, 0)]  # (URL, Depth)

    while to_visit:
        current_url, depth = to_visit.pop(0)
        if depth > max_depth or current_url in visited_urls:
            continue

        print(f"Visiting: {current_url} (Depth {depth})")
        visited_urls.add(current_url)
        results["visited_sites"].append(current_url)

        html = fetch_html(current_url)
        if not html:
            continue

        if start_url not in current_url and check_copyright_violations(html):
            print(f"Copyright violation detected on: {current_url}")
            results["violations"][current_url] = "Potential violation detected"

        links = extract_links(html, base_url=current_url)
        # Ignore links if extract_links = start_url
        to_visit.extend([(link, depth + 1) for link in links if link not in visited_urls and start_url not in link])

# Investigate URL link
start_url = "https://anas-mohammad.net"
web_crawler(start_url)

# results overview
print("\n+++++ Results Analysis +++++")
print("1+ Searched websites:")
for site in results["visited_sites"]:
    print(f"- {site}")

print("\n2+ Found copyright violations:")
if results["violations"]:
    for site, info in results["violations"].items():
        print(f"- {site}: {info}")
else:
    print("- No violations found.")

print("\n3+. Errors during fetching:")
if results["errors"]:
    for url, error in results["errors"]:
        print(f"- {url}: {error}")
else:
    print("- No errors during fetching.")


Visiting: https://anas-mohammad.net (Depth 0)
Visiting: https://www.google.com/url?q=https%3A%2F%2Fgithub.com%2Fanasm20&sa=D&sntz=1&usg=AOvVaw2aF7s-qMGjNp9yzaJITugl (Depth 1)
Visiting: https://www.google.com/url?q=https%3A%2F%2Fwww.linkedin.com%2F&sa=D&sntz=1&usg=AOvVaw27zHZd9hlyIAn1w47VRLrD (Depth 1)

+++++ Results Analysis +++++
1- Searched websites:
- https://anas-mohammad.net
- https://www.google.com/url?q=https%3A%2F%2Fgithub.com%2Fanasm20&sa=D&sntz=1&usg=AOvVaw2aF7s-qMGjNp9yzaJITugl
- https://www.google.com/url?q=https%3A%2F%2Fwww.linkedin.com%2F&sa=D&sntz=1&usg=AOvVaw27zHZd9hlyIAn1w47VRLrD
- https://anas-mohammad.net
- https://www.google.com/url?q=https%3A%2F%2Fgithub.com%2Fanasm20&sa=D&sntz=1&usg=AOvVaw2aF7s-qMGjNp9yzaJITugl
- https://www.google.com/url?q=https%3A%2F%2Fwww.linkedin.com%2F&sa=D&sntz=1&usg=AOvVaw27zHZd9hlyIAn1w47VRLrD
- https://anas-mohammad.net
- https://www.google.com/url?q=https%3A%2F%2Fgithub.com%2Fanasm20&sa=D&sntz=1&usg=AOvVaw2aF7s-qMGjNp9yzaJITugl
- https: