In [105]:
import requests
from bs4 import BeautifulSoup

In [106]:
def check_page_not_found(soup):
    """Tell whether the parsed page contains the site's 404 marker.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Returns:
        True when an element with class ``ah_404`` exists, False otherwise.
    """
    not_found_marker = soup.find(class_='ah_404')
    return not_found_marker is not None

In [107]:
def fetch_soup(url, timeout=10):
    """Download ``url`` and return it parsed as a BeautifulSoup document.

    Progress is reported with ``print`` ('Success', the 404 message, or the
    request error), so the function is meant for interactive use.

    Args:
        url: Address of the page to download.
        timeout: Seconds forwarded to ``requests.get``. Without a timeout a
            stalled server would block the call indefinitely.

    Returns:
        The parsed page, or None when the request fails (including HTTP
        error statuses and timeouts) or the page carries the 404 marker.
    """
    # Only the network calls can raise RequestException, so only they are guarded.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

    page_soup = BeautifulSoup(response.content, 'html.parser')

    # The response passed raise_for_status, yet the page may still be a
    # "not found" page, so the markup itself is checked as well.
    if check_page_not_found(page_soup):
        print('Fail - Page Not Found 404')
        return None

    print('Success')
    return page_soup

In [108]:
def save_soup(soup, file_name):
    """Write the string form of ``soup`` to ``<file_name>.html`` as UTF-8.

    Args:
        soup: Object whose ``str()`` is the markup to store.
        file_name: Destination path without the ``.html`` extension.
    """
    target_path = file_name + '.html'
    markup = str(soup)
    with open(target_path, mode='w', encoding='utf-8') as html_file:
        html_file.write(markup)

In [109]:
def fetch_movie_title(soup):
    """Return the text of the ``<h1 class="heading_movie">`` element.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The heading's text as-is (not stripped), or 'N/A' when the heading
        is not found.
    """
    heading = soup.find('h1', {"class": "heading_movie"})
    if not heading:
        return 'N/A'
    return heading.text

In [110]:
def fetch_movie_genre(soup):
    """Extract the genre entries listed in the ``<div class="list_cate">`` element.

    The div's text is split into lines; blank lines are discarded and each
    remaining line is stripped. The first non-empty line is then dropped
    (presumably a label preceding the genres -- TODO confirm against the page).

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Returns:
        A list of genre strings. It is empty when the div is missing, because
        the 'N/A' placeholder becomes the single line that gets dropped.
    """
    category_div = soup.find('div', {"class": "list_cate"})
    raw_text = category_div.text if category_div else 'N/A'

    non_empty_lines = []
    for line in raw_text.splitlines():
        stripped = line.strip()
        if stripped:
            non_empty_lines.append(stripped)

    # Skip the leading entry; only the lines after it are returned.
    return non_empty_lines[1:]

In [111]:
def fetch_movie_rating(soup):
    """Pull two whitespace-separated tokens out of the ``<div class="score">`` text.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Returns:
        * ``[second_token, fourth_token]`` when the text has at least four
          tokens (presumably the score and the vote count -- TODO confirm
          against the page markup);
        * the string 'Invalid rating format' when it has fewer than four;
        * the string 'N/A' when the div is missing or its text is exactly 'N/A'.
    """
    score_div = soup.find('div', {"class": "score"})
    score_text = score_div.text if score_div else 'N/A'

    if score_text == 'N/A':
        return score_text

    tokens = score_text.split()
    if len(tokens) < 4:
        return 'Invalid rating format'
    return [tokens[1], tokens[3]]

In [112]:
def fetch_movie_information(soup):
    """Collect the title, genre and rating of one movie page into a dict.

    Args:
        soup: Parsed movie page, passed unchanged to the three field extractors.

    Returns:
        A dict with the keys 'title', 'genre' and 'rating', holding whatever
        ``fetch_movie_title``, ``fetch_movie_genre`` and ``fetch_movie_rating``
        return for ``soup``.
    """
    return {
        'title': fetch_movie_title(soup),
        'genre': fetch_movie_genre(soup),
        'rating': fetch_movie_rating(soup),
    }

In [113]:
def fetch_pages(start_page=1, end_page=3):
    """Download consecutive listing pages and return their parsed documents.

    Pages are requested in ascending order from ``start_page`` to
    ``end_page`` (both inclusive). Each URL is printed before it is fetched,
    and a separator line is printed after every successful page.

    Args:
        start_page: Number of the first page to fetch.
        end_page: Number of the last page to fetch.

    Returns:
        The list of parsed pages, in page order. Fetching stops at the first
        page for which ``fetch_soup`` returns None, so the list may be shorter
        than the requested range; it is empty when ``start_page > end_page``.
    """
    separator = '\t----------\t----------\t---------\t----------\t----------\t---------'
    page_soups = []

    for page_number in range(start_page, end_page + 1):
        page_url = f"https://animehay.biz/the-loai/anime-{page_number}.html"
        print("Processing on page: ", page_url)

        page_soup = fetch_soup(page_url)
        if page_soup is None:
            # Give up on the remaining pages after the first failure.
            break

        page_soups.append(page_soup)
        print(separator)

    return page_soups

In [114]:
# Download the listing pages using fetch_pages' defaults (pages 1 through 3).
all_soups = fetch_pages()

Processing on page:  https://animehay.biz/the-loai/anime-1.html
Success
	----------	----------	---------	----------	----------	---------
Processing on page:  https://animehay.biz/the-loai/anime-2.html
Success
	----------	----------	---------	----------	----------	---------
Processing on page:  https://animehay.biz/the-loai/anime-3.html
Success
	----------	----------	---------	----------	----------	---------


In [115]:
def get_all_movies_link(soup):
    """Collect the ``href`` of every anchor whose target contains 'thong-tin-phim'.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find_all`` method.

    Returns:
        The matching ``href`` values in document order (presumably the movie
        detail pages -- TODO confirm). Duplicates are kept.
    """
    movie_links = []
    # href=True restricts the search to anchors that have an href attribute.
    for anchor in soup.find_all('a', href=True):
        target = anchor['href']
        if "thong-tin-phim" in target:
            movie_links.append(target)
    return movie_links