In [1]:
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [12]:
class MovieSpider:
    """Scrape a run of numbered pages and collect the parsed result of each.

    Args:
        url_pattern: ``%``-style format string with a single placeholder
            that receives the 1-based page number.
        pages_to_scrape: Maximum number of pages to request.
        sleep_interval: Seconds to pause before each request; values
            ``<= 0`` disable the pause.
        content_parser: Callable that receives the raw response body and
            returns the parsed items. When ``None``, the raw body is
            returned unparsed.
    """

    def __init__(self, url_pattern, pages_to_scrape=25, sleep_interval=-1, content_parser=None):
        self.url_pattern = url_pattern
        self.pages_to_scrape = pages_to_scrape
        self.sleep_interval = sleep_interval
        self.content_parser = content_parser

    def scrape_url(self, url):
        """Fetch *url* and return its parsed content.

        Returns:
            The value produced by ``content_parser`` for the response body
            (or the raw body when no parser was configured), or ``None``
            when the request failed or the server answered with a 4xx/5xx
            status. A message describing the failure is printed.
        """
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.Timeout:
            print("Timeout error")
            return None
        except requests.exceptions.TooManyRedirects:
            print("Redirect error")
            return None
        except requests.exceptions.SSLError:
            print("SSL error")
            return None
        except requests.exceptions.RequestException:
            print("Unknown error")
            return None

        # Do not hand error pages to the parser: their markup is not the
        # content the parser was written for.
        if 400 <= response.status_code < 500:
            print('Request failed because the resource either does not exist or is forbidden')
            return None
        if response.status_code >= 500:
            print('Request failed because the response server encountered an error')
            return None

        if self.content_parser is None:
            return response.content
        return self.content_parser(response.content)

    def kickstart(self):
        """Scrape pages ``1..pages_to_scrape`` and return one result per page.

        Each page is requested exactly once. Pages whose request failed
        (``scrape_url`` returned ``None``) are skipped. Scraping stops early
        at the first page that yields an empty result, which is taken to
        mean the listing has run out of pages.

        Returns:
            A list with the non-empty result of every successfully scraped
            page, in page order.
        """
        output = []

        for page_number in range(1, self.pages_to_scrape + 1):
            if self.sleep_interval > 0:
                time.sleep(self.sleep_interval)

            page_items = self.scrape_url(self.url_pattern % page_number)

            if page_items is None:
                # The failure was already reported by scrape_url.
                continue
            if not page_items:
                print("No more pages to scrape")
                break

            output.append(page_items)

        return output

# Paginated checklist URL; the ``%s`` placeholder is filled with the page number.
URL_PATTERN = (
    'https://www.listchallenges.com/'
    'top-1000-greatest-movies-of-all-time-by-imdb/checklist/%s/'
)

def titles_parser(content):
    """Return the cleaned text of every ``<div class="item-name">`` in *content*.

    Args:
        content: HTML markup to parse.

    Returns:
        A list of strings, one per matching element, in document order,
        with tabs, carriage returns, and newline-plus-space sequences
        removed and leading/trailing newlines stripped.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and warns about the guess.
    soup = BeautifulSoup(content, "html.parser")

    return [
        element.text.replace("\t", "").replace("\r", "").replace("\n ", "").strip('\n')
        for element in soup.find_all('div', {"class": "item-name"})
    ]

# Crawl up to 25 checklist pages, then flatten the per-page lists into a
# single list of titles.
my_spider = MovieSpider(url_pattern=URL_PATTERN, pages_to_scrape=25, content_parser=titles_parser)
movies = []
for page_titles in my_spider.kickstart():
    movies.extend(page_titles)
movies


['The Shawshank Redemption (1994)',
 'The Godfather (1972)',
 'The Godfather: Part II (1974)',
 'The Dark Knight (2008)',
 'The Good, the Bad and the Ugly (1966)',
 'Pulp Fiction (1994)',
 'The Lord of the Rings: The Return of the King (2003)',
 '12 Angry Men (1957)',
 "Schindler's List (1993)",
 'Fight Club (1999)',
 'The Lord of the Rings: The Fellowship of the Ring (2001)',
 'Inception (2010)',
 'The Empire Strikes Back (1980)',
 'Forrest Gump (1994)',
 "One Flew Over the Cuckoo's Nest (1975)",
 'Seven Samurai (1954)',
 'The Lord of the Rings: The Two Towers (2002)',
 'Goodfellas (1990)',
 'Star Wars (1977)',
 'The Matrix (1999)',
 'City of God (2002)',
 'Once Upon a Time in the West (1968)',
 "It's a Wonderful Life (1946)",
 'The Usual Suspects (1995)',
 'Se7en (1995)',
 'City Lights (1931)',
 'Leon: The Professional (1994)',
 'Casablanca (1942)',
 'The Silence of the Lambs (1991)',
 'Life Is Beautiful (1997)',
 'The Intouchables (2011)',
 'Spirited Away (2001)',
 'Raiders of the L