# Crawls through the wikipedia pages year by year

In [102]:
import re
import traceback
import requests
from datetime import datetime, timedelta
from collections import namedtuple
from bs4 import BeautifulSoup
import pandas as pd

In [111]:
class CrawlerException(Exception):
    def __init__(self, start, message="Year not in (2001, 2020) range"):
        self.start = start
        self.message = message
        super().__init__(self.message)

In [116]:
class AniCrawler:
    WIKI_URL = "https://en.wikipedia.org/wiki/List_of_animated_feature_films_of"
    def __init__(self, *args):
        if len(args) == 0:
            self._year = str(int(datetime.now().strftime("%Y"))-1)
        elif len(args) == 1:
            self._year = args[0]
        elif len(args)==2:
            try:
                if 2000<args[0]<=2019 and 2000<args[1]<=2019:
                    if args[0]>args[1]:
                        start_year = args[1]
                        end_year = args[0]
                        self._duration = [i for i in range(start_year, end_year)]
                    elif args[0]<args[1]:
                        start_year = args[0]
                        end_year = args[1]
                        self._duration = [i for i in range(start_year, end_year)]                    
                else:
                    self._duration = []
                    raise CrawlerException(args)
            except CrawlerException as e:
                print(e)
        else:
            print("Bad args!")
        self._films = []
       
               
    def duration(self):
        try:
            return self._duration
        except AttributeError:
            return self._year
    
    @property
    def urls(self):
        if self.duration():
            self._urls = list(map(lambda s : "_".join([self.WIKI_URL, str(s)]), self.duration()))
            return self._urls
        return "_".join([self.WIKI_URL, self._year])
    
    @property
    def films(self):
        self._films = self._makeFilms() 
        return self._films  
    
    @staticmethod
    def _requestUrl(url_string):
        try:
            return requests.get(url_string).text
        except Exception:
            print("Request Invalid")
 
    @staticmethod
    def _makeSoup(res):
        try:
            return BeautifulSoup(res, "html.parser")
        except Exception:
            print("Bad Soup")
            
    @staticmethod
    def _getTable(soup):
        return soup.find('table', {'class': 'sortable wikitable'})

    def _makeFilms(self):
        for _ in self.urls:
            current_url = self._requestUrl(_)
            current_soup = self._makeSoup(current_url)
            current_table = self._getTable(current_soup)
            self._films.append(self._getRows(current_table))
        return self._films
#         try:        
#             return self.films(self._getRows(current_table))
#         except Exception:
#             pass
                       
    def _getHeaders(self, tables):
        film_attributes = []
        try:
            rows = tables.find_all('th')
            for row in rows:
                film_attributes.append(self._stringCleanup(row.text))
                return film_attributes
        except AttributeError:
            pass
        return film_attributes
        
    @staticmethod
    def _stringCleanup(my_string):
        try:
            my_string = re.sub('\n','',my_string)
        except Exception:
            pass
        return my_string
    
    
    def _getRows(self, table):
        headers = self._getHeaders(table)
        if len(headers)>0:
            Film = namedtuple('Film', headers[:-1])
            rows = table.find_all('tr')
            for row in rows[1:]:
                cells = row.find_all('td')
                s = Film(cells[0].text, cells[1].text, cells[2].text, cells[3].text, cells[4].text)
                return s
        else:
            print("Unable to Parse table")

In [117]:
sd = AniCrawler(1999, 2007)

print(sd.duration())
# print(sd.films)
# # print(sd.urls)

Year not in (2001, 2020) range


AttributeError: 'AniCrawler' object has no attribute '_year'