In [5]:
from urllib.parse import urlparse, urlunparse

def normalize_url(url):
    parsed_url = urlparse(url)
    scheme = parsed_url.scheme.lower()
    netloc = parsed_url.netloc.lower()
    normalized_parsed_url = parsed_url._replace(scheme=scheme, netloc=netloc)
    normalized_url = urlunparse(normalized_parsed_url)
    return normalized_url



urls_example = [
    # Normalized URLs
    "https://www.example.it",
    "https://www.example.it/alf",
    "https://www.example.es/alf",
    "https://www.example.es/riz",
    "https://www.example.com/riz",

    # Not Normalized URLs
    "https://www.example.it",
    "https://www.example.it/alf",
    "https://www.example.es/alf",
    "http://www.example.es/riz?id=123&ref=homepage",
    "http://www.example.com/riz?q=product&sort=price"
]

# normalize_url('http://www.example.com/search?q=product&sort=price')

In [6]:
"""
* This function counts how many unique normalized valid URLs were passed to the function
*
* Accepts a list of URLs
*
* Example:
*
* input: ['https://example.com']
* output: 1
*
* Notes:
*  - assume none of the URLs have authentication information (username, password).
*
* Normalized URL:
*  - process in which a URL is modified and standardized: https://en.wikipedia.org/wiki/URL_normalization
*
#    For example.
#    These 2 urls are the same:
#    input: ["https://example.com", "https://example.com/"]
#    output: 1
#
#    These 2 are not the same:
#    input: ["https://example.com", "http://example.com"]
#    output 2
#
#    These 2 are the same:
#    input: ["https://example.com?", "https://example.com"]
#    output: 1
#
#    These 2 are the same:
#    input: ["https://example.com?a=1&b=2", "https://example.com?b=2&a=1"]
#    output: 1
"""

def count_unique_urls(urls: list[str]) -> int:
    unique_urls = []
    while len(urls)>0:
        duplic_indexes = [0]
        for i in range(1,len(urls)):
            # print(normalize_url(urls[0]))
            if normalize_url(urls[0]) == normalize_url(urls[i]):
                duplic_indexes.append(i)
        unique_urls.append(normalize_url(urls[0]))
        urls = [url for index, url in enumerate(urls) if index not in duplic_indexes] 
    return len(unique_urls), unique_urls


count_unique_urls(urls_example)

(7,
 ['https://www.example.it',
  'https://www.example.it/alf',
  'https://www.example.es/alf',
  'https://www.example.es/riz',
  'https://www.example.com/riz',
  'http://www.example.es/riz?id=123&ref=homepage',
  'http://www.example.com/riz?q=product&sort=price'])

In [39]:
"""
 * This function counts how many unique normalized valid URLs were passed to the function per top level domain
 *
 * A top level domain is a domain in the form of example.com. Assume all top level domains end in .com
 * subdomain.example.com is not a top level domain.
 *
 * Accepts a list of URLs
 *
 * Example:
 *
 * input: ["https://example.com"]
 * output: Hash["example.com" => 1]
 *
 * input: ["https://example.com", "https://subdomain.example.com"]
 * output: Hash["example.com" => 2]
 *
"""

from collections import defaultdict

def count_unique_urls_per_top_level_domain(urls: list[str]) -> Dict[str, int]:
    
    unique_urls_dom = defaultdict(int)
    _, unique_urls = count_unique_urls(urls)
    
    for url in unique_urls:
        domain = urlparse(url).netloc
        unique_urls_dom[domain] += 1
    
    return unique_urls_dom

count_unique_urls_per_top_level_domain(urls_example)

defaultdict(int,
            {'www.example.it': 2, 'www.example.es': 3, 'www.example.com': 2})