In [2]:
import logging
import json
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

logging.basicConfig(format = '%(asctime)s %(levelname)s:%(message)s', level = logging.INFO)

class Crawler:

    def __init__(self, urls = [], limit = 300):
        
        self.visited_urls = []
        self.urls_to_visit = urls
        self.limit = limit
        return

    def download_url(self, url):
        try:
            return requests.get(url).text
        
        except Exception as E:
            return ''

    def get_linked_urls(self, url, html):
        soup = BeautifulSoup(html, 'html.parser')
        
        for link in soup.find_all('a'):
            
            path = link.get('href')
            
            if path and path.startswith('/'):
                path = urljoin(url, path)
            
            yield path

    def add_url_to_visit(self, url):
        total_crawled = len(self.visited_urls) + len(self.urls_to_visit)
        if total_crawled >= self.limit:
            return
        
        if url and url not in self.visited_urls and url not in self.urls_to_visit:
            # Why the if url?
            self.urls_to_visit.append(url)
            return

    def crawl(self, url):
        html = self.download_url(url)
        
        for url in self.get_linked_urls(url, html):
            self.add_url_to_visit(url)

    def run(self):
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            
            logging.info(f' Fetching {url}')
            
            try:
                self.crawl(url)
            
            except Exception:
                logging.exception(f' Failed to fetch {url}')
            
            finally:
                self.visited_urls.append(url)

if __name__ == '__main__':

    crawler = Crawler(urls = ['https://www.cfsindia.org/category/catalogue/'])
    crawler.run()

    display = False
    
    if display == True:
        print(crawler.visited_urls, flush = True)

    # Saving visited URLs
    with open('cfsindia_crawled_URLs_500.json', 'w+') as f:
        json.dump(crawler.visited_urls, f)

# No parallelism
# No retry mechanism
# No URL normalization ~ Homework: Find a python library function that does 'URL normalization'
# Ignores Robots.txt file

2021-10-23 20:59:24,287 INFO: Fetching https://www.cfsindia.org/category/catalogue/
2021-10-23 20:59:25,173 INFO: Fetching https://www.cfsindia.org/
2021-10-23 20:59:26,072 INFO: Fetching https://www.cfsindia.org/screen-reader-access/
2021-10-23 20:59:26,923 INFO: Fetching javascript:void(0);
2021-10-23 20:59:26,924 INFO: Fetching #Link
2021-10-23 20:59:26,926 INFO: Fetching http://cfsindia.org/about-cfsi/who-we-are/
2021-10-23 20:59:29,004 INFO: Fetching http://cfsindia.org/about-cfsi/what-we-do/
2021-10-23 20:59:31,019 INFO: Fetching #!
2021-10-23 20:59:31,021 INFO: Fetching http://www.cfsindia.org/category/catalogue/
2021-10-23 20:59:32,391 INFO: Fetching  http://cfsindia.org/film-festival/
2021-10-23 20:59:34,585 INFO: Fetching http://cfsindia.org/contact/
2021-10-23 20:59:37,591 INFO: Fetching http://cfsindia.org/category/catalogue/
2021-10-23 20:59:39,767 INFO: Fetching https://www.cfsindia.org/wp-content/uploads/2020/03/buy-cfsi-films-dvd.pdf
2021-10-23 20:59:45,520 INFO: Fetchi

2021-10-23 21:03:44,002 INFO: Fetching http://cfsindia.org/category/catalogue/
2021-10-23 21:03:46,122 INFO: Fetching http://cfsindia.org/
2021-10-23 21:03:48,635 INFO: Fetching http://cfsindia.org/the-cake-story/
2021-10-23 21:03:51,001 INFO: Fetching #
2021-10-23 21:03:51,002 INFO: Fetching https://www.cfsindia.org/nani-teri-morni/
2021-10-23 21:03:51,816 INFO: Fetching https://www.cfsindia.org/tennis-buddies/
2021-10-23 21:03:52,624 INFO: Fetching https://www.cfsindia.org/pinti-ka-sabun/
2021-10-23 21:03:53,454 INFO: Fetching https://www.cfsindia.org/shaanu-the-optimist/
2021-10-23 21:03:54,337 INFO: Fetching https://www.cfsindia.org/sabuj-dweper-raja-dweep-ka-rahasya/
2021-10-23 21:03:55,174 INFO: Fetching https://www.cfsindia.org/pappu-ki-pugdandi/
2021-10-23 21:03:56,032 INFO: Fetching https://www.cfsindia.org/pehle-aap/
2021-10-23 21:03:56,849 INFO: Fetching https://www.cfsindia.org/mujhse-dosti-karoge/
2021-10-23 21:03:57,664 INFO: Fetching https://www.cfsindia.org/kimas-lode-b

2021-10-23 21:05:12,198 INFO: Fetching https://www.facebook.com/CFSI.ORG/?ref=page_internal
2021-10-23 21:05:13,013 INFO: Fetching https://www.facebook.com/CFSI.ORG/about/?ref=page_internal
2021-10-23 21:05:13,588 INFO: Fetching https://www.facebook.com/CFSI.ORG/photos/?ref=page_internal
2021-10-23 21:05:15,140 INFO: Fetching https://www.facebook.com/CFSI.ORG/events/?ref=page_internal
2021-10-23 21:05:15,657 INFO: Fetching https://www.facebook.com/CFSI.ORG/videos/?ref=page_internal
2021-10-23 21:05:16,174 INFO: Fetching https://www.facebook.com/CFSI.ORG/posts/?ref=page_internal
2021-10-23 21:05:18,419 INFO: Fetching https://www.facebook.com/CFSI.ORG/community/?ref=page_internal
2021-10-23 21:05:19,744 INFO: Fetching https://www.facebook.com/login/?privacy_mutation_token=eyJ0eXBlIjowLCJjcmVhdGlvbl90aW1lIjoxNjM1MDAzMDA4LCJjYWxsc2l0ZV9pZCI6Mzc4Mzc1MTU5OTY2NjIxfQ%3D%3D&next=https%3A%2F%2Fwww.facebook.com%2FCFSI.ORG%2F
2021-10-23 21:05:20,021 INFO: Fetching https://www.facebook.com/reg/?pri

2021-10-23 21:06:03,195 INFO: Fetching https://www.linkedin.com/legal/copyright-policy?trk=registration_footer-copyright-policy
2021-10-23 21:06:03,759 INFO: Fetching https://brand.linkedin.com/policies?trk=registration_footer-brand-policy
2021-10-23 21:06:05,637 INFO: Fetching https://www.linkedin.com/psettings/guest-controls?trk=registration_footer-guest-controls
2021-10-23 21:06:06,198 INFO: Fetching https://www.linkedin.com/legal/professional-community-policies?trk=registration_footer-community-guide
2021-10-23 21:06:06,736 INFO: Fetching https://www.youtube.com/
2021-10-23 21:06:07,164 INFO: Fetching https://www.youtube.com/about/
2021-10-23 21:06:07,568 INFO: Fetching https://www.youtube.com/about/press/
2021-10-23 21:06:08,406 INFO: Fetching https://www.youtube.com/about/copyright/
2021-10-23 21:06:08,764 INFO: Fetching https://www.youtube.com/t/contact_us/
2021-10-23 21:06:08,967 INFO: Fetching https://www.youtube.com/creators/
2021-10-23 21:06:09,397 INFO: Fetching https://www