In [1]:
import logging
import json
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

logging.basicConfig(format = '%(asctime)s %(levelname)s:%(message)s', level = logging.INFO)

class Crawler:

    def __init__(self, urls = [], limit = 500):
        
        self.visited_urls = []
        self.urls_to_visit = urls
        self.limit = limit
        return

    def download_url(self, url):
        try:
            return requests.get(url).text
        
        except Exception as E:
            return ''

    def get_linked_urls(self, url, html):
        soup = BeautifulSoup(html, 'html.parser')
        
        for link in soup.find_all('a'):
            
            path = link.get('href')
            
            if path and path.startswith('/'):
                path = urljoin(url, path)
            
            yield path

    def add_url_to_visit(self, url):
        total_crawled = len(self.visited_urls) + len(self.urls_to_visit)
        if total_crawled >= self.limit:
            return
        
        if url and url not in self.visited_urls and url not in self.urls_to_visit:
            # Why the if url?
            self.urls_to_visit.append(url)
            return

    def crawl(self, url):
        html = self.download_url(url)
        
        for url in self.get_linked_urls(url, html):
            self.add_url_to_visit(url)

    def run(self):
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            
            logging.info(f' Fetching {url}')
            
            try:
                self.crawl(url)
            
            except Exception:
                logging.exception(f' Failed to fetch {url}')
            
            finally:
                self.visited_urls.append(url)

if __name__ == '__main__':

    crawler = Crawler(urls = ['https://www.oscars.org/oscars/ceremonies/2021'])
    crawler.run()

    display = False
    
    if display == True:
        print(crawler.visited_urls, flush = True)

    # Saving visited URLs
    with open('oscars_2021_crawled_URLs_500.json', 'w+') as f:
        json.dump(crawler.visited_urls, f)

# No parallelism
# No retry mechanism
# No URL normalization ~ Homework: Find a python library function that does 'URL normalization'
# Ignores Robots.txt file

2021-10-18 21:19:21,030 INFO: Fetching https://www.oscars.org/oscars/ceremonies/2021
2021-10-18 21:19:21,300 INFO: Fetching #main-content
2021-10-18 21:19:21,301 INFO: Fetching #
2021-10-18 21:19:21,302 INFO: Fetching https://www.oscars.org/
2021-10-18 21:19:21,466 INFO: Fetching https://members.oscars.org/
2021-10-18 21:19:23,879 INFO: Fetching https://www.oscars.org/oscars
2021-10-18 21:19:24,055 INFO: Fetching https://www.oscars.org/governors
2021-10-18 21:19:24,211 INFO: Fetching https://www.oscars.org/sci-tech
2021-10-18 21:19:24,386 INFO: Fetching https://shop.oscars.org
2021-10-18 21:19:24,631 INFO: Fetching https://www.oscars.org/museum
2021-10-18 21:19:24,801 INFO: Fetching https://aframe.oscars.org?dr=https%3A%2F%2Foscars.org
2021-10-18 21:19:25,133 INFO: Fetching https://www.oscars.org/events
2021-10-18 21:19:25,288 INFO: Fetching https://www.oscars.org/collection-highlights
2021-10-18 21:19:25,446 INFO: Fetching https://www.oscars.org/videos-photos/academy-originals
2021-10

2021-10-18 21:20:25,970 INFO: Fetching https://www.oscars.org/oscars/ceremonies/1992
2021-10-18 21:20:26,652 INFO: Fetching https://www.oscars.org/oscars/ceremonies/1993
2021-10-18 21:20:26,943 INFO: Fetching https://www.oscars.org/oscars/ceremonies/1994
2021-10-18 21:20:27,340 INFO: Fetching https://www.oscars.org/oscars/ceremonies/1995
2021-10-18 21:20:27,645 INFO: Fetching https://www.oscars.org/oscars/ceremonies/1996
2021-10-18 21:20:27,924 INFO: Fetching https://www.oscars.org/oscars/ceremonies/1997
2021-10-18 21:20:28,249 INFO: Fetching https://www.oscars.org/oscars/ceremonies/1998
2021-10-18 21:20:28,549 INFO: Fetching https://www.oscars.org/oscars/ceremonies/1999
2021-10-18 21:20:30,219 INFO: Fetching https://www.oscars.org/oscars/ceremonies/2001
2021-10-18 21:20:30,441 INFO: Fetching https://www.oscars.org/oscars/ceremonies/2002
2021-10-18 21:20:30,753 INFO: Fetching https://www.oscars.org/oscars/ceremonies/2003
2021-10-18 21:20:31,047 INFO: Fetching https://www.oscars.org/osc

2021-10-18 21:21:53,570 INFO: Fetching https://www.oscars.org/sites/oscars/files/2022_govawards_web_1920x1040_2_custom_0.png
2021-10-18 21:22:01,890 INFO: Fetching https://www.oscars.org/news/academy-honor-danny-glover-samuel-l-jackson-elaine-may-and-liv-ullmann-oscarsr-2022-governors
2021-10-18 21:22:02,637 INFO: Fetching https://www.oscars.org/sites/oscars/files/nm2021_desktop_2560x1440_seal.png
2021-10-18 21:22:06,548 INFO: Fetching https://oscars.org/newmembers2021/
2021-10-18 21:22:07,361 INFO: Fetching https://www.oscars.org/sites/oscars/files/aframe_homepage_v1.jpg
2021-10-18 21:22:08,678 INFO: Fetching https://aframe.oscars.org/
2021-10-18 21:22:08,863 INFO: Fetching https://www.oscars.org/sites/oscars/files/aperture_homepage_v2.jpg
2021-10-18 21:22:11,470 INFO: Fetching https://www.oscars.org/news/academy-announces-next-phase-equity-and-inclusion-initiatives
2021-10-18 21:22:11,598 INFO: Fetching https://www.oscars.org/sites/oscars/files/academydialogues_homepage_02_1.jpg
2021

2021-10-18 21:22:37,742 INFO: Fetching https://aframe.oscars.org/news/post/boom-dune-and-raccoon-city-this-week-in-movie-trailers
2021-10-18 21:22:37,845 INFO: Fetching https://aframe.oscars.org/lists/director-john-ridleys-6-movie-picks-that-move-the-needle
2021-10-18 21:22:37,973 INFO: Fetching https://aframe.oscars.org/lists/6-movies-that-helped-shape-mass
2021-10-18 21:22:38,207 INFO: Fetching https://aframe.oscars.org/lists/tim-blake-nelson-my-6-favorite-westerns
2021-10-18 21:22:38,368 INFO: Fetching https://aframe.oscars.org/lists/writer-director-hagai-levis-five-favorite-bergman-works
2021-10-18 21:22:38,515 INFO: Fetching https://aframe.oscars.org/news/post/questloves-unlikely-trip-back-to-summer-of-soul
2021-10-18 21:22:38,626 INFO: Fetching https://aframe.oscars.org/news/post/short-takes-animated-films
2021-10-18 21:22:38,737 INFO: Fetching https://aframe.oscars.org/news/post/dru-and-denver-help-us-re-discover-best-picture-a-beautiful-mind
2021-10-18 21:22:38,844 INFO: Fetchi

2021-10-18 21:23:25,671 INFO: Fetching https://www.oscars.org/vef/load/9bde9d66695842108c5a0eb4197c97a0?width=640&height=365&iframe=true
2021-10-18 21:23:25,817 INFO: Fetching https://www.oscars.org/vef/load/2b439f266f25e39c525afd18a48b7609?width=640&height=365&iframe=true
2021-10-18 21:23:25,962 INFO: Fetching https://www.oscars.org/vef/load/4e3de3b371402f20f35bce46f3ce654f?width=640&height=365&iframe=true
2021-10-18 21:23:26,108 INFO: Fetching https://www.oscars.org/vef/load/8fa68477395c576d506f368c250bc74a?width=640&height=365&iframe=true
2021-10-18 21:23:26,258 INFO: Fetching https://www.oscars.org/vef/load/3d45261575854de70b920fb4351130a9?width=640&height=365&iframe=true
2021-10-18 21:23:26,400 INFO: Fetching https://www.oscars.org/vef/load/1084b41c435c6ccb32ed560957a0a36a?width=640&height=365&iframe=true
2021-10-18 21:23:26,545 INFO: Fetching https://www.oscars.org/vef/load/4483a50eebc26cbab983d68153a39047?width=640&height=365&iframe=true
2021-10-18 21:23:26,695 INFO: Fetching ht

2021-10-18 21:23:34,489 INFO: Fetching https://www.oscars.org/vef/load/6ff2908a77ab8db15ab609437259aa47?width=640&height=365&iframe=true
2021-10-18 21:23:34,631 INFO: Fetching https://www.oscars.org/vef/load/761a4c653df90f1a5c437ffb59e44256?width=640&height=365&iframe=true
2021-10-18 21:23:34,767 INFO: Fetching https://www.oscars.org/vef/load/de55c0f91f88b035abb402647ceccdcb?width=640&height=365&iframe=true
2021-10-18 21:23:34,909 INFO: Fetching https://www.oscars.org/vef/load/29e156b4bcf452fd73ca75445e91a70e?width=640&height=365&iframe=true
2021-10-18 21:23:35,052 INFO: Fetching https://www.oscars.org/vef/load/efdb302123e6474669b84e8aae18e26f?width=640&height=365&iframe=true
2021-10-18 21:23:35,197 INFO: Fetching https://www.oscars.org/vef/load/b109a6d0a9e8720e1697317092d23e74?width=640&height=365&iframe=true
2021-10-18 21:23:35,348 INFO: Fetching https://www.oscars.org/vef/load/49233347af118f52fed50a58cf1d0345?width=640&height=365&iframe=true
2021-10-18 21:23:35,494 INFO: Fetching ht

2021-10-18 21:24:03,335 INFO: Fetching https://www.oscars.org/news/will-packer-produce-94th-oscarsr#disqus_thread
2021-10-18 21:24:03,489 INFO: Fetching https://www.oscars.org/news/academy-partners-filmaid-filmmaking-conversation-series
2021-10-18 21:24:04,753 INFO: Fetching https://www.oscars.org/news/academy-partners-filmaid-filmmaking-conversation-series#disqus_thread
2021-10-18 21:24:04,913 INFO: Fetching https://www.oscars.org/news/meet-2021-student-academy-awards-finalists
2021-10-18 21:24:05,127 INFO: Fetching https://www.oscars.org/news/meet-2021-student-academy-awards-finalists#disqus_thread
2021-10-18 21:24:05,275 INFO: Fetching https://www.oscars.org/news/david-rubin-re-elected-academy-president-academy-elects-2021-2022-board-governors
2021-10-18 21:24:06,593 INFO: Fetching https://www.oscars.org/news/david-rubin-re-elected-academy-president-academy-elects-2021-2022-board-governors#disqus_thread
2021-10-18 21:24:06,757 INFO: Fetching https://www.oscars.org/news/academy-annou