In [1]:
import logging
import json
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

logging.basicConfig(format = '%(asctime)s %(levelname)s:%(message)s', level = logging.INFO)

class Crawler:

    def __init__(self, urls = [], limit = 500):
        
        self.visited_urls = []
        self.urls_to_visit = urls
        self.limit = limit
        return

    def download_url(self, url):
        try:
            return requests.get(url).text
        
        except Exception as E:
            return ''

    def get_linked_urls(self, url, html):
        soup = BeautifulSoup(html, 'html.parser')
        
        for link in soup.find_all('a'):
            
            path = link.get('href')
            
            if path and path.startswith('/'):
                path = urljoin(url, path)
            
            yield path

    def add_url_to_visit(self, url):
        total_crawled = len(self.visited_urls) + len(self.urls_to_visit)
        if total_crawled >= self.limit:
            return
        
        if url and url not in self.visited_urls and url not in self.urls_to_visit:
            # Why the if url?
            self.urls_to_visit.append(url)
            return

    def crawl(self, url):
        html = self.download_url(url)
        
        for url in self.get_linked_urls(url, html):
            self.add_url_to_visit(url)

    def run(self):
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            
            logging.info(f' Fetching {url}')
            
            try:
                self.crawl(url)
            
            except Exception:
                logging.exception(f' Failed to fetch {url}')
            
            finally:
                self.visited_urls.append(url)

if __name__ == '__main__':

    crawler = Crawler(urls = ['https://filmsdivision.org/film-catalogue.html/'])
    crawler.run()

    display = False
    
    if display == True:
        print(crawler.visited_urls, flush = True)

    # Saving visited URLs
    with open('films_division_crawled_URLs_500.json', 'w+') as f:
        json.dump(crawler.visited_urls, f)

# No parallelism
# No retry mechanism
# No URL normalization ~ Homework: Find a python library function that does 'URL normalization'
# Ignores Robots.txt file

2021-10-09 14:38:56,638 INFO: Fetching https://filmsdivision.org/film-catalogue.html/
2021-10-09 14:39:01,555 INFO: Fetching https://filmsdivision.org/wp-login.php?action=lostpassword
2021-10-09 14:39:03,163 INFO: Fetching javascript:void(0);
2021-10-09 14:39:03,165 INFO: Fetching https://filmsdivision.org/home_hi.html
2021-10-09 14:39:05,735 INFO: Fetching https://filmsdivision.org
2021-10-09 14:39:08,609 INFO: Fetching https://filmsdivision.org/
2021-10-09 14:39:10,923 INFO: Fetching https://filmsdivision.org/about-us.html
2021-10-09 14:39:12,607 INFO: Fetching https://filmsdivision.org/activities.html
2021-10-09 14:39:14,252 INFO: Fetching https://filmsdivision.org/gallery.html
2021-10-09 14:39:16,109 INFO: Fetching https://filmsdivision.org/events.html
2021-10-09 14:39:24,913 INFO: Fetching http://miff.in
2021-10-09 14:40:33,619 INFO: Fetching https://filmsdivision.org/nmic.html
2021-10-09 14:40:36,479 INFO: Fetching index.php
2021-10-09 14:40:36,481 INFO: Fetching https://filmsdiv

2021-10-09 14:43:43,805 INFO: Fetching https://filmsdivision.org/shop/siddheshwari-3
2021-10-09 14:43:45,653 INFO: Fetching https://filmsdivision.org/shop/sleeping-cities
2021-10-09 14:43:47,392 INFO: Fetching https://filmsdivision.org/shop/spaces-between-2
2021-10-09 14:43:49,131 INFO: Fetching https://filmsdivision.org/shop/even-red-can-be-sad
2021-10-09 14:43:50,975 INFO: Fetching https://filmsdivision.org/shop/the-last-adieu
2021-10-09 14:43:52,816 INFO: Fetching http://www.youtube.com/embed/9IgoTwzmYYI?rel=0&wmode=transparent
2021-10-09 14:43:53,146 INFO: Fetching https://filmsdivision.org/shop/rangbhoomi
2021-10-09 14:43:54,863 INFO: Fetching http://www.youtube.com/embed/xFf-_IhaIf4?rel=0&wmode=transparent
2021-10-09 14:43:55,199 INFO: Fetching https://filmsdivision.org/%e0%a4%b5%e0%a4%bf%e0%a4%a4%e0%a4%b0%e0%a4%a3.html
2021-10-09 14:43:56,908 INFO: Fetching https://filmsdivision.org/%e0%a4%a5%e0%a4%bf%e0%a4%8f%e0%a4%9f%e0%a4%b0-%e0%a4%b0%e0%a4%bf%e0%a4%b2%e0%a5%80%e0%a5%9b.html


2021-10-09 14:47:45,616 INFO: Fetching https://filmsdivision.org/wp-content/gallery/new-gallery/FDS_5084c.jpg
2021-10-09 14:47:52,132 INFO: Fetching https://filmsdivision.org/wp-content/gallery/new-gallery/FDS_5077c.jpg
2021-10-09 14:48:01,733 INFO: Fetching https://filmsdivision.org/wp-content/gallery/new-gallery/FDS_5069c.jpg
2021-10-09 14:48:05,024 INFO: Fetching https://filmsdivision.org/wp-content/gallery/new-gallery/FDS_4746.jpg
2021-10-09 14:48:09,783 INFO: Fetching https://filmsdivision.org/wp-content/gallery/new-gallery/FDS_4744.jpg
2021-10-09 14:48:16,322 INFO: Fetching https://filmsdivision.org/wp-content/gallery/new-gallery/FDS_4734.jpg
2021-10-09 14:48:22,971 INFO: Fetching https://filmsdivision.org/wp-content/gallery/new-gallery/FDS_4508.jpg
2021-10-09 14:48:27,218 INFO: Fetching https://filmsdivision.org/wp-content/gallery/new-gallery/FDS_4456.jpg
2021-10-09 14:48:32,147 INFO: Fetching https://filmsdivision.org/wp-content/gallery/new-gallery/FDS_4445.jpg
2021-10-09 14:48

2021-10-09 15:01:11,507 INFO: Fetching https://miff.in/prism-national/
2021-10-09 15:01:13,039 INFO: Fetching https://miff.in/prism-international/
2021-10-09 15:01:14,377 INFO: Fetching https://miff.in/national-competition/
2021-10-09 15:01:16,816 INFO: Fetching https://miff.in/international-competition/
2021-10-09 15:01:19,382 INFO: Fetching https://miff.in/audi-i/
2021-10-09 15:01:20,709 INFO: Fetching https://miff.in/audi-ii/
2021-10-09 15:01:22,348 INFO: Fetching https://miff.in/j-b-hall/
2021-10-09 15:01:23,845 INFO: Fetching https://miff.in/rr-ii/
2021-10-09 15:01:25,624 INFO: Fetching https://miff.in/rr-iii/
2021-10-09 15:01:26,952 INFO: Fetching https://miff.in/screening/audi-i/
2021-10-09 15:01:28,348 INFO: Fetching https://miff.in/screening/screening-programme-day-wise/
2021-10-09 15:01:29,722 INFO: Fetching https://miff.in/competition/catalogue-2020/
2021-10-09 15:01:31,044 INFO: Fetching https://miff.in/catalogue/
2021-10-09 15:01:32,382 INFO: Fetching https://miff.in/catal

2021-10-09 15:08:38,776 INFO: Fetching https://filmsdivision.org/category/tender-notices/page/2
2021-10-09 15:08:40,626 INFO: Fetching https://filmsdivision.org/category/tender-notices
2021-10-09 15:08:42,262 INFO: Fetching https://filmsdivision.org/the-fd-zone/27-september-2019.html
2021-10-09 15:08:44,002 INFO: Fetching https://filmsdivision.org/the-fd-zone/13-september-2019.html
2021-10-09 15:08:45,746 INFO: Fetching https://filmsdivision.org/the-fd-zone/23rd-august-2019.html
2021-10-09 15:08:47,420 INFO: Fetching https://filmsdivision.org/the-fd-zone/9-august-2019.html
2021-10-09 15:08:49,232 INFO: Fetching https://filmsdivision.org/the-fd-zone/26-july-2019-2.html
2021-10-09 15:08:50,988 INFO: Fetching https://filmsdivision.org/the-fd-zone/26-july-2019.html
2021-10-09 15:08:52,726 INFO: Fetching https://filmsdivision.org/the-fd-zone/12-july-2019.html
2021-10-09 15:08:54,456 INFO: Fetching https://filmsdivision.org/the-fd-zone/1-dollar-city-2-night-life-3-note-from-the-crematorium.h

2021-10-09 15:10:36,250 INFO: Fetching Tel:044
2021-10-09 15:10:36,250 INFO: Fetching mailto:fdchennai@filmsdivision.org
2021-10-09 15:10:36,251 INFO: Fetching mailto:fdhyderabad@filmsdivision.org
2021-10-09 15:10:36,252 INFO: Fetching Tel:0866
2021-10-09 15:10:36,252 INFO: Fetching mailto:fdvijaywada@filmsdivision.org
2021-10-09 15:10:36,253 INFO: Fetching Tel:033
2021-10-09 15:10:36,254 INFO: Fetching mailto:fdkolkata@filmsdivision.org
2021-10-09 15:10:36,254 INFO: Fetching mailto:Fdtvpuram1847@bsnl.in
2021-10-09 15:10:36,255 INFO: Fetching Tel:022-23551350
2021-10-09 15:10:36,256 INFO: Fetching mailto:fdnagpur@filmsdivision.org
2021-10-09 15:10:36,257 INFO: Fetching mailto:fdlucknow@filmsdivision.org
2021-10-09 15:10:36,258 INFO: Fetching https://filmsdivision.org/citizens-charter.html
2021-10-09 15:10:38,117 INFO: Fetching https://filmsdivision.org/film-catalogue.html
2021-10-09 15:10:40,533 INFO: Fetching https://filmsdivision.org/wp-content/uploads/2020/04/ANNEXURE_-_A__FILMS_COM

2021-10-09 15:13:53,957 INFO: Fetching https://www.facebook.com/DOKLeipzig/
2021-10-09 15:13:56,502 INFO: Fetching https://www.facebook.com/goetheinstitut.bruessel/
2021-10-09 15:13:58,248 INFO: Fetching https://www.facebook.com/pages/Documentary-Channel-Canada/105607016140506
2021-10-09 15:13:59,253 INFO: Fetching https://www.facebook.com/FiftHeadFactory/
2021-10-09 15:14:00,794 INFO: Fetching https://www.facebook.com/BritishFilmInstitute/
2021-10-09 15:14:02,707 INFO: Fetching https://www.facebook.com/WorldSkillsLeipzig2013/
2021-10-09 15:14:05,400 INFO: Fetching https://www.facebook.com/getyourcrowd/
2021-10-09 15:14:07,568 INFO: Fetching https://www.facebook.com/DocWok/
2021-10-09 15:14:10,214 INFO: Fetching https://www.facebook.com/filmmakersalliance/
2021-10-09 15:14:12,059 INFO: Fetching https://www.facebook.com/Wüstenschiff-Forum-188449511203544/
2021-10-09 15:14:14,722 INFO: Fetching https://www.facebook.com/oxfamindia/
2021-10-09 15:14:16,863 INFO: Fetching https://www.facebo