# Homework 8

## Option 1 - URL analysis

### Cell 1 – Imports

In [11]:
import pandas as pd
import requests
from urllib.parse import urlparse, parse_qsl
import tldextract


### Cell 2 – URL Analysis Function

In [13]:
def analyze_url(url):
    """Break a URL into its standard components.

    Parameters
    ----------
    url : str
        The URL to analyze.

    Returns
    -------
    dict
        A flat record with the keys "URL", "Scheme", "TLD", "Domain",
        "Subdomain", "Host", "Port", "Path", "File Name" and
        "Query Parameters".

        * "Host" is the raw network-location part returned by ``urlparse``.
        * "Port" is ``None`` when the URL has no explicit port or when the
          port is malformed / out of range.
        * "File Name" is the last path segment if it contains a dot,
          otherwise an empty string.
        * "Query Parameters" is a dict; if a key is repeated in the query
          string, only its last value is kept.
    """
    parsed = urlparse(url)
    ext = tldextract.extract(url)

    path = parsed.path
    file_name = ""
    if path and path != "/":
        # Ignore a trailing slash so ".../report.pdf/" still yields "report.pdf".
        last_part = path.rstrip("/").split("/")[-1]
        if "." in last_part:
            file_name = last_part

    # urlparse validates the port lazily: reading ``.port`` raises ValueError
    # for a non-numeric or out-of-range port. Report such ports as missing
    # instead of aborting the whole analysis.
    try:
        port = parsed.port
    except ValueError:
        port = None

    return {
        "URL": url,
        "Scheme": parsed.scheme,
        "TLD": ext.suffix,
        "Domain": ext.domain,
        "Subdomain": ext.subdomain,
        "Host": parsed.netloc,
        "Port": port,
        "Path": path,
        "File Name": file_name,
        # keep_blank_values=True so flags such as "?debug" or "?q=" are
        # reported rather than silently dropped.
        "Query Parameters": dict(parse_qsl(parsed.query, keep_blank_values=True))
    }


### Cell 3 – robots.txt Fetching and Parsing

In [15]:
def fetch_robots(base_url):
    """Download the robots.txt file served under ``base_url``.

    Parameters
    ----------
    base_url : str
        Scheme and host of the site, e.g. ``"https://example.com"``. A
        trailing slash is tolerated.

    Returns
    -------
    str or None
        The body of ``<base_url>/robots.txt`` when the server answers with
        HTTP 200 (this may be an empty string), otherwise ``None``. ``None``
        is also returned when the request itself fails.
    """
    robots_url = base_url.rstrip("/") + "/robots.txt"
    try:
        response = requests.get(
            robots_url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=10
        )
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        # Best-effort fetch: connection errors, timeouts, invalid URLs etc.
        # all mean "no robots.txt available". Only request-level failures are
        # caught, so KeyboardInterrupt and genuine bugs still propagate.
        return None
    return None


def parse_robots(text):
    """Extract user-agents, disallowed paths and crawl-delays from robots.txt.

    The directives are pooled across the whole file; they are not grouped per
    user-agent.

    Parameters
    ----------
    text : str
        Raw contents of a robots.txt file.

    Returns
    -------
    dict
        * "User-Agents": sorted list of the distinct user-agent names.
        * "Disallowed Paths": every non-empty ``Disallow`` value, in file
          order (duplicates are kept).
        * "Crawl-Delay": every ``Crawl-delay`` value in file order, as a
          float when numeric, otherwise as the raw string.
    """
    user_agents = set()
    disallow = []
    crawl_delays = []

    for raw_line in text.splitlines():
        # A '#' starts a comment anywhere on the line, so cut it off before
        # parsing; otherwise "Disallow: /x # note" would yield "/x # note".
        line = raw_line.split("#", 1)[0].strip()
        if not line or ":" not in line:
            continue

        key, value = line.split(":", 1)
        key = key.lower().strip()
        value = value.strip()

        # A directive without a value carries nothing to report
        # (an empty "Disallow:" means "nothing is disallowed").
        if not value:
            continue

        if key == "user-agent":
            user_agents.add(value)
        elif key == "disallow":
            disallow.append(value)
        elif key == "crawl-delay":
            try:
                crawl_delays.append(float(value))
            except ValueError:
                # Keep a non-numeric delay verbatim rather than losing it.
                crawl_delays.append(value)

    return {
        "User-Agents": sorted(user_agents),
        "Disallowed Paths": disallow,
        "Crawl-Delay": crawl_delays
    }


### Cell 4 – Run Analysis on All URLs

In [17]:
# URLs to analyze: one with a query string, one HTML page, one PDF file.
urls = [
    "https://edition.cnn.com/2025/12/22/media/60-minutes-cecot-bari-weiss-canada-global-tv?iid=cnn_buildContentRecirc_end_recirc&recs_exp=most-popular-article-end&tenant_id=popular.en",
    "https://www.nhm.ac.uk/visit/exhibitions/wildlife-photographer-of-the-year.html",
    "https://is-web.hevra.haifa.ac.il/images/2025_SEM._aa.pdf"
]

url_results = []
robots_results = []

for url in urls:
    info = analyze_url(url)
    url_results.append(info)

    # fetch_robots appends "/robots.txt" to this scheme + host base.
    base = f"{info['Scheme']}://{info['Host']}"
    robots_text = fetch_robots(base)

    # fetch_robots returns None when the file could not be retrieved. An empty
    # body ("") is still an existing robots.txt, so compare against None
    # instead of relying on truthiness.
    robots_exists = robots_text is not None
    if robots_exists:
        parsed = parse_robots(robots_text)
    else:
        parsed = {"User-Agents": [], "Disallowed Paths": [], "Crawl-Delay": []}

    robots_results.append({
        "Host": base,
        "robots.txt Exists": robots_exists,
        "User-Agents": parsed["User-Agents"],
        # Disallow rules are site-relative paths; prefix them with the base
        # to present full URLs.
        "Disallowed URLs": [base + p for p in parsed["Disallowed Paths"]],
        "Crawl-Delay": parsed["Crawl-Delay"]
    })

# Build each DataFrame once from the collected records.
df_urls = pd.DataFrame(url_results)
df_robots = pd.DataFrame(robots_results)

df_urls


Unnamed: 0,URL,Scheme,TLD,Domain,Subdomain,Host,Port,Path,File Name,Query Parameters
0,https://edition.cnn.com/2025/12/22/media/60-mi...,https,com,cnn,edition,edition.cnn.com,,/2025/12/22/media/60-minutes-cecot-bari-weiss-...,,"{'iid': 'cnn_buildContentRecirc_end_recirc', '..."
1,https://www.nhm.ac.uk/visit/exhibitions/wildli...,https,ac.uk,nhm,www,www.nhm.ac.uk,,/visit/exhibitions/wildlife-photographer-of-th...,wildlife-photographer-of-the-year.html,{}
2,https://is-web.hevra.haifa.ac.il/images/2025_S...,https,ac.il,haifa,is-web.hevra,is-web.hevra.haifa.ac.il,,/images/2025_SEM._aa.pdf,2025_SEM._aa.pdf,{}


### Cell 5 – robots.txt Results

In [19]:
# Display the per-host robots.txt summary (one row per analyzed URL).
df_robots


Unnamed: 0,Host,robots.txt Exists,User-Agents,Disallowed URLs,Crawl-Delay
0,https://edition.cnn.com,True,"[*, AI2Bot, Ai2Bot-Dolma, Amazonbot, Applebot-...","[https://edition.cnn.com/, https://edition.cnn...",[]
1,https://www.nhm.ac.uk,True,[*],"[https://www.nhm.ac.uk/uksf-bin/, https://www....",[8.0]
2,https://is-web.hevra.haifa.ac.il,True,[*],[https://is-web.hevra.haifa.ac.il/administrato...,[]


## Summary – URL Analysis & Robots.txt

In this task, we implemented **Option 1: URL Analysis**.  
Each given URL was parsed and decomposed into its standard components, including **scheme, TLD, domain, subdomain, host, port, path, file name, and query parameters**.  
The extracted information was presented in a structured **DataFrame** for clarity and analysis.

In addition, we checked the existence of a **robots.txt** file for each URL host to ensure polite and responsible crawling behavior.  
When a robots.txt file was found, we extracted the declared **User-Agent names**, the list of **disallowed paths (converted to full URLs)** collected across all user-agent groups, and any **crawl-delay** directives.

The results demonstrate correct understanding of URL structure and adherence to web crawling best practices, with all findings clearly summarized in tabular form.
