In [1]:
import logging
import json
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

logging.basicConfig(format = '%(asctime)s %(levelname)s:%(message)s', level = logging.INFO)

class Crawler:

    def __init__(self, urls = [], limit = 500):
        
        self.visited_urls = []
        self.urls_to_visit = urls
        self.limit = limit
        return

    def download_url(self, url):
        try:
            return requests.get(url).text
        
        except Exception as E:
            return ''

    def get_linked_urls(self, url, html):
        soup = BeautifulSoup(html, 'html.parser')
        
        for link in soup.find_all('a'):
            
            path = link.get('href')
            
            if path and path.startswith('/'):
                path = urljoin(url, path)
            
            yield path

    def add_url_to_visit(self, url):
        total_crawled = len(self.visited_urls) + len(self.urls_to_visit)
        if total_crawled >= self.limit:
            return
        
        if url and url not in self.visited_urls and url not in self.urls_to_visit:
            # Why the if url?
            self.urls_to_visit.append(url)
            return

    def crawl(self, url):
        html = self.download_url(url)
        
        for url in self.get_linked_urls(url, html):
            self.add_url_to_visit(url)

    def run(self):
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            
            logging.info(f' Fetching {url}')
            
            try:
                self.crawl(url)
            
            except Exception:
                logging.exception(f' Failed to fetch {url}')
            
            finally:
                self.visited_urls.append(url)

if __name__ == '__main__':

    crawler = Crawler(urls = ['https://oscars.org/'])
    crawler.run()

    display = False
    
    if display == True:
        print(crawler.visited_urls, flush = True)

    # Saving visited URLs
    with open('oscars_crawled_URLs_500.json', 'w+') as f:
        json.dump(crawler.visited_urls, f)

# No parallelism
# No retry mechanism
# No URL normalization ~ Homework: Find a python library function that does 'URL normalization'
# Ignores Robots.txt file

2021-10-18 20:42:38,188 INFO: Fetching https://oscars.org/
2021-10-18 20:42:40,942 INFO: Fetching #main-content
2021-10-18 20:42:40,943 INFO: Fetching #
2021-10-18 20:42:40,944 INFO: Fetching https://oscars.org/
2021-10-18 20:42:41,775 INFO: Fetching https://members.oscars.org/
2021-10-18 20:42:47,503 INFO: Fetching https://oscars.org/oscars
2021-10-18 20:42:49,803 INFO: Fetching https://oscars.org/governors
2021-10-18 20:42:50,824 INFO: Fetching https://oscars.org/sci-tech
2021-10-18 20:42:51,716 INFO: Fetching https://shop.oscars.org
2021-10-18 20:42:53,117 INFO: Fetching https://oscars.org/museum
2021-10-18 20:42:54,161 INFO: Fetching https://aframe.oscars.org?dr=https%3A%2F%2Foscars.org
2021-10-18 20:42:56,829 INFO: Fetching https://oscars.org/events
2021-10-18 20:42:57,857 INFO: Fetching https://oscars.org/collection-highlights
2021-10-18 20:43:00,032 INFO: Fetching https://oscars.org/videos-photos/academy-originals
2021-10-18 20:43:01,244 INFO: Fetching https://oscars.org/academy

2021-10-18 20:45:17,584 INFO: Fetching https://www.oscars.org/oscars/awards-databases-0
2021-10-18 20:45:18,261 INFO: Fetching https://oscars.org/governors
2021-10-18 20:45:19,146 INFO: Fetching https://oscars.org/governors/ceremonies/2020
2021-10-18 20:45:21,004 INFO: Fetching https://www.oscars.org/governors/ceremonies/2020
2021-10-18 20:45:21,214 INFO: Fetching https://oscars.org/governors/about
2021-10-18 20:45:22,975 INFO: Fetching https://www.oscars.org/governors/about
2021-10-18 20:45:23,971 INFO: Fetching https://oscars.org/governors/thalberg
2021-10-18 20:45:25,545 INFO: Fetching https://www.oscars.org/governors/thalberg
2021-10-18 20:45:26,436 INFO: Fetching https://oscars.org/governors/hersholt
2021-10-18 20:45:28,067 INFO: Fetching https://www.oscars.org/governors/hersholt
2021-10-18 20:45:28,357 INFO: Fetching https://oscars.org/governors/honorary
2021-10-18 20:45:29,722 INFO: Fetching https://www.oscars.org/governors/honorary
2021-10-18 20:45:30,647 INFO: Fetching https:/

2021-10-18 20:47:04,420 INFO: Fetching https://www.oscars.org/events/global-movie-day
2021-10-18 20:47:05,279 INFO: Fetching https://www.oscars.org/events/2020-scientific-technical-awards
2021-10-18 20:47:06,367 INFO: Fetching https://www.oscars.org/events/93rd-oscars-nominations-announcement
2021-10-18 20:47:07,251 INFO: Fetching https://www.oscars.org/content/fun-fact-2
2021-10-18 20:47:08,557 INFO: Fetching https://www.oscars.org/events/hedwig-and-angry-inch-2001-20th-anniversary-virtual-reunion
2021-10-18 20:47:09,887 INFO: Fetching https://www.oscars.org/content/fun-fact-3
2021-10-18 20:47:10,758 INFO: Fetching https://www.oscars.org/events/nfmla-academy-2021-latinx-and-hispanic-cinema
2021-10-18 20:47:11,766 INFO: Fetching https://www.oscars.org/content/fun-fact-4
2021-10-18 20:47:12,863 INFO: Fetching https://oscars.org/collection-highlights
2021-10-18 20:47:13,666 INFO: Fetching https://oscars.org/collection-highlights/richard-donner
2021-10-18 20:47:16,190 INFO: Fetching https

2021-10-18 20:48:16,180 INFO: Fetching https://www.oscars.org/vef/load/0d9526c37b4371d6c11273d6b74fe270?width=640&height=365&iframe=true
2021-10-18 20:48:16,748 INFO: Fetching https://www.oscars.org/vef/load/5496ac440d3e0ef850da0de52ea4112f?width=640&height=365&iframe=true
2021-10-18 20:48:17,447 INFO: Fetching https://www.oscars.org/vef/load/1bc1c09e9b6775ffff7287089c2280bb?width=640&height=365&iframe=true
2021-10-18 20:48:18,089 INFO: Fetching https://www.oscars.org/vef/load/703ffbdbe87be18e1036447e843d2c3f?width=640&height=365&iframe=true
2021-10-18 20:48:18,736 INFO: Fetching https://www.oscars.org/vef/load/916855fc9660e4dbed82df667a4b4665?width=640&height=365&iframe=true
2021-10-18 20:48:19,352 INFO: Fetching https://www.oscars.org/vef/load/352b88a026dbdf3639fae607e416f9dd?width=640&height=365&iframe=true
2021-10-18 20:48:19,948 INFO: Fetching https://www.oscars.org/vef/load/b6a75da66c8df75dab2b65bce75cf3bb?width=640&height=365&iframe=true
2021-10-18 20:48:20,519 INFO: Fetching ht

2021-10-18 20:48:55,976 INFO: Fetching https://www.oscars.org/vef/load/eb3e5127dcab5681ab7333d843674b0d?width=640&height=365&iframe=true
2021-10-18 20:48:56,629 INFO: Fetching https://www.oscars.org/vef/load/932d0f1faae2ff25db52ea3e1aa32e61?width=640&height=365&iframe=true
2021-10-18 20:48:56,809 INFO: Fetching https://www.oscars.org/vef/load/2694c5e99445acf0a208eda794df31c8?width=640&height=365&iframe=true
2021-10-18 20:48:57,401 INFO: Fetching https://www.oscars.org/vef/load/ccbc424d88942c4deaa2b211fb790a65?width=640&height=365&iframe=true
2021-10-18 20:48:58,058 INFO: Fetching https://www.oscars.org/vef/load/66b68175d33d2fb8a28e2735d18bd5db?width=640&height=365&iframe=true
2021-10-18 20:48:58,678 INFO: Fetching https://oscars.org/academy-story
2021-10-18 20:49:00,515 INFO: Fetching https://oscars.org/academy-story/1920--1929
2021-10-18 20:49:01,929 INFO: Fetching https://oscars.org/academy-story/1930--1939
2021-10-18 20:49:03,933 INFO: Fetching https://oscars.org/academy-story/1940-

2021-10-18 20:50:51,560 INFO: Fetching https://oscars.org/videos-photos
2021-10-18 20:50:52,244 INFO: Fetching https://oscars.org/videos-photos/officer-and-gentleman-ny
2021-10-18 20:50:52,766 INFO: Fetching https://oscars.org/videos-photos/enter-dragon
2021-10-18 20:50:53,419 INFO: Fetching https://oscars.org/videos-photos/86th-oscars-highlights
2021-10-18 20:50:55,227 INFO: Fetching https://oscars.org/videos-photos/academy-conversations
2021-10-18 20:50:55,945 INFO: Fetching https://oscars.org/videos-photos/academy-film-archive
2021-10-18 20:50:56,500 INFO: Fetching https://oscars.org/videos-photos/academy-womens-initiative-la-event
2021-10-18 20:50:57,034 INFO: Fetching https://oscars.org/videos-photos/shorts-nominees
2021-10-18 20:50:57,563 INFO: Fetching https://oscars.org/videos-photos/oscars-nominees-lunch-february-5-2018
2021-10-18 20:50:58,091 INFO: Fetching https://oscars.org/videos-photos/academy-academy
2021-10-18 20:50:58,640 INFO: Fetching https://oscars.org/videos-photos

2021-10-18 20:52:30,118 INFO: Fetching https://oscars.org/videos-photos/princess-bride-live-commentary
2021-10-18 20:52:31,196 INFO: Fetching https://oscars.org/videos-photos/silent-salon
2021-10-18 20:52:31,727 INFO: Fetching https://oscars.org/videos-photos/satyajit-rays-apu-trilogy
2021-10-18 20:52:32,960 INFO: Fetching https://oscars.org/videos-photos/shane-0
2021-10-18 20:52:33,483 INFO: Fetching https://oscars.org/videos-photos/academy-tribute-gabriel-figueroa
2021-10-18 20:52:34,009 INFO: Fetching https://oscars.org/videos-photos/real-reel-art-action
2021-10-18 20:52:35,606 INFO: Fetching https://oscars.org/videos-photos/closely-watched-trains
2021-10-18 20:52:36,650 INFO: Fetching https://oscars.org/videos-photos/amazing-medium-richard-williams
2021-10-18 20:52:37,193 INFO: Fetching https://oscars.org/videos-photos/shane
2021-10-18 20:52:37,838 INFO: Fetching https://oscars.org/videos-photos/z
2021-10-18 20:52:38,379 INFO: Fetching https://oscars.org/videos-photos/life-home-fro