In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
def check_page_not_found(soup):
    """Tell whether the parsed page contains the site's "not found" marker.

    A page is treated as "not found" when it contains at least one element
    whose CSS class is ``ah_404``.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find``.

    Returns:
        bool: ``True`` if an ``ah_404`` element exists, ``False`` otherwise.
    """
    # The filter has to be passed as ``attrs``. Given positionally, the dict
    # lands in ``find``'s first parameter (the tag-name filter), so the class
    # attribute is never inspected and the marker is never found.
    return soup.find(attrs={"class": "ah_404"}) is not None

In [4]:
def fetch_soup(url, timeout=10):
    """Download ``url`` and parse the response body with ``html.parser``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        fails (any ``requests`` exception, including a timeout or an HTTP
        error status) or when ``check_page_not_found`` flags the page.
    """
    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # stalled server and a single bad URL would freeze the whole scrape.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

    page_soup = BeautifulSoup(response.content, 'html.parser')

    if check_page_not_found(page_soup):
        print('Fail - Page Not Found 404')
        return None

    return page_soup

In [5]:
def save_soup(soup, file_name):
    """Write the string form of ``soup`` to ``<file_name>.html`` (UTF-8).

    Args:
        soup: Object whose ``str()`` representation is written to disk.
        file_name: Destination path without the ``.html`` extension.
    """
    html_path = ''.join((file_name, '.html'))
    with open(html_path, mode='w', encoding='utf-8') as html_file:
        markup = str(soup)
        html_file.write(markup)

In [6]:
def fetch_movie_title(soup):
    """Return the text of the page's ``<h1 class="heading_movie">`` element.

    Args:
        soup: Parsed movie page.

    Returns:
        str: The heading text, or ``'N/A'`` when the heading is absent.
    """
    heading = soup.find('h1', {"class": "heading_movie"})
    if heading:
        return heading.text
    return 'N/A'

In [1]:
def fetch_movie_description(soup):
    """Return the text of the first ``<p>`` inside the description frame.

    The frame is the ``<div>`` whose class attribute is ``desc ah-frame-bg``.

    Args:
        soup: Parsed movie page.

    Returns:
        str: The paragraph text, or ``'N/A'`` when either the frame or its
        paragraph is missing.
    """
    frame = soup.find('div', {"class": "desc ah-frame-bg"})
    if not frame:
        return 'N/A'

    paragraph = frame.find('p')
    if not paragraph:
        return 'N/A'

    return paragraph.text

In [9]:
def fetch_movie_genre(soup):
    """Return the genre names listed in the page's ``div.list_cate`` element.

    The element's text is split into lines, blank lines are discarded and
    every remaining line is stripped. The first remaining line is dropped
    (presumably a label preceding the genres - confirm against the page).

    Args:
        soup: Parsed movie page.

    Returns:
        list[str]: Genre names; empty when the element is missing.
    """
    container = soup.find('div', {"class": "list_cate"})
    raw_text = container.text if container else 'N/A'

    non_blank_lines = []
    for line in raw_text.splitlines():
        stripped = line.strip()
        if stripped:
            non_blank_lines.append(stripped)

    # Skipping the first entry also discards the 'N/A' placeholder, which is
    # why a missing element yields an empty list.
    return non_blank_lines[1:]

In [None]:
def fetch_movie_status(soup):
    """Return the movie's status text from the ``div.status`` element.

    The value is the stripped text of the last ``<div>`` nested inside the
    status container.

    Args:
        soup: Parsed movie page.

    Returns:
        str: The status text, or ``'Undefined'`` when the container is
        missing or holds no nested ``<div>``.
    """
    container = soup.find('div', {"class": "status"})
    if not container:
        return 'Undefined'

    inner_divs = container.find_all('div')
    if not inner_divs:
        # Indexing [-1] on an empty result would raise IndexError and make
        # the caller discard the whole movie; use the fallback instead.
        return 'Undefined'

    return inner_divs[-1].text.strip()

In [None]:
def fetch_movie_episodes(soup):
    """Count the episodes listed on a movie page.

    The count is the number of whitespace-separated tokens in the text of
    the ``<div>`` whose class attribute is ``list-item-episode scroll-bar``
    (assumes each episode label is a single token - confirm against the page).

    Args:
        soup: Parsed movie page.

    Returns:
        int: Number of tokens found, or ``0`` when the episode list is absent.
    """
    episode_box = soup.find('div', {'class': 'list-item-episode scroll-bar'})
    if not episode_box:
        # Counting the tokens of an 'N/A' placeholder would report one
        # episode for a page that lists none.
        return 0
    return len(episode_box.text.split())

In [None]:
def fetch_movie_release_year(soup):
    """Return the release-year text from the ``div.update_time`` element.

    The value is the stripped text of the last ``<div>`` nested inside the
    ``update_time`` container.

    Args:
        soup: Parsed movie page.

    Returns:
        str: The extracted text, or ``'N/A'`` when the container is missing
        or holds no nested ``<div>``.
    """
    container = soup.find('div', {"class": "update_time"})
    if not container:
        return 'N/A'

    inner_divs = container.find_all('div')
    if not inner_divs:
        # Indexing [-1] on an empty result would raise IndexError and make
        # the caller discard the whole movie; use the fallback instead.
        return 'N/A'

    return inner_divs[-1].text.strip()

In [10]:
def fetch_movie_rating(soup):
    """Extract two rating tokens from the page's ``div.score`` element.

    The element's text is split on whitespace and the tokens at positions 1
    and 3 are returned (presumably the score and the number of votes -
    confirm against the page markup).

    Args:
        soup: Parsed movie page.

    Returns:
        list[str] | str: ``[token_1, token_3]`` when at least four tokens are
        present; ``'Invalid rating format'`` when there are fewer; ``'N/A'``
        when the element is missing or its text is exactly ``'N/A'``.
    """
    score_box = soup.find('div', {"class": "score"})
    score_text = score_box.text if score_box else 'N/A'

    if score_text == 'N/A':
        return score_text

    tokens = score_text.split()
    if len(tokens) < 4:
        return 'Invalid rating format'

    return [tokens[1], tokens[3]]

In [7]:
def fetch_movie_information(soup):
    """Collect every scraped field of one movie page into a single dict.

    Each value comes from the matching ``fetch_movie_*`` helper, called in
    the order the keys are listed below.

    Args:
        soup: Parsed movie page.

    Returns:
        dict: Keys ``'title'``, ``'genre'``, ``'rating'``, ``'description'``,
        ``'status'``, ``'episodes'`` and ``'release year'``.
    """
    return {
        'title': fetch_movie_title(soup),
        'genre': fetch_movie_genre(soup),
        'rating': fetch_movie_rating(soup),
        'description': fetch_movie_description(soup),
        'status': fetch_movie_status(soup),
        'episodes': fetch_movie_episodes(soup),
        'release year': fetch_movie_release_year(soup),
    }

In [9]:
def fetch_pages(start_page = 1, end_page = 3):
    """Download consecutive anime listing pages and return their parsed form.

    Pages ``start_page`` through ``end_page`` (inclusive) are requested in
    order. The loop stops at the first page ``fetch_soup`` cannot return.

    Args:
        start_page: Number of the first listing page to download.
        end_page: Number of the last listing page to download.

    Returns:
        list: One parsed document per page downloaded successfully.
    """
    divider = '\t----------\t----------\t---------\t----------\t----------\t---------'
    page_soups = []
    page_number = start_page

    while page_number <= end_page:
        page_url = f"https://animehay.biz/the-loai/anime-{page_number}.html"
        print("Processing on page: ", page_url)

        page_soup = fetch_soup(page_url)
        if page_soup is None:
            # Give up on the remaining pages once one of them fails.
            break

        page_soups.append(page_soup)
        print(divider)
        page_number += 1

    return page_soups

In [10]:
# Download the listing pages using fetch_pages' defaults (pages 1 to 3);
# all_soups holds one parsed document per page that was fetched successfully.
all_soups = fetch_pages()

Processing on page:  https://animehay.biz/the-loai/anime-1.html
	----------	----------	---------	----------	----------	---------
Processing on page:  https://animehay.biz/the-loai/anime-2.html
	----------	----------	---------	----------	----------	---------
Processing on page:  https://animehay.biz/the-loai/anime-3.html
	----------	----------	---------	----------	----------	---------


In [11]:
def get_all_movies_links(soup):
    """Collect the movie-detail links found on a parsed page.

    Every ``<a>`` element that has an ``href`` attribute is inspected; only
    hrefs containing ``thong-tin-phim`` are kept, in document order.

    Args:
        soup: Parsed listing page.

    Returns:
        list[str]: Matching href values (duplicates are not removed).
    """
    movie_hrefs = []
    for anchor in soup.find_all('a', href = True):
        href = anchor['href']
        if "thong-tin-phim" in href:
            movie_hrefs.append(href)
    return movie_hrefs

In [12]:
# Flatten the movie links of every downloaded listing page into one list.
# Links are kept in page order; repeated links are not removed here.
all_links = []
for soup in all_soups:
    specific_links_soup = get_all_movies_links(soup)
    all_links.extend(specific_links_soup)

In [13]:
def fetch_all_movies(list_links, save = False):
    """Scrape every movie page in ``list_links`` into a dict keyed by title.

    Each link is downloaded with ``fetch_soup`` and parsed with
    ``fetch_movie_information``. A link that fails to download or raises
    while being processed is reported and skipped, so one bad page does not
    abort the run. A link that was already processed successfully is not
    downloaded again.

    Args:
        list_links: Iterable of movie page URLs.
        save: When true, also store each page's HTML via ``save_soup`` under
            a file name derived from the movie title.

    Returns:
        dict: Maps each movie title to its information dict. Movies sharing
        a title (including the ``'N/A'`` fallback) overwrite one another.
    """
    # Characters that are invalid in Windows file names or act as path
    # separators; replaced before a title is used as a file name.
    unsafe_filename_chars = '\\/:*?"<>|'

    all_movies = {}
    processed_links = set()
    for link in list_links:
        if link in processed_links:
            # Re-fetching would only rebuild an entry that is already stored.
            continue
        try:
            print(f"Processing link: {link}")
            movie_soup = fetch_soup(link)
            if not movie_soup:
                print(f"Failed to fetch data from {link}")
                continue

            information = fetch_movie_information(movie_soup)
            title = information['title']

            if save:
                # Raw titles such as "Re: Zero" or the 'N/A' fallback are not
                # safe file names (':' and '/' break or redirect the write).
                safe_name = ''.join(
                    '_' if char in unsafe_filename_chars else char
                    for char in title
                )
                save_soup(movie_soup, safe_name)

            all_movies[title] = information
            processed_links.add(link)

            print(f"Successfully processed: {title}")
            print('----------\t---------\t----------')

        except Exception as ex:
            # Deliberately broad: a scrape should survive any single bad page.
            print(f"Error while processing link {link}: {ex}")

    return all_movies

In [14]:
# Scrape every collected movie link. Pages are not written to disk because
# fetch_all_movies' `save` argument keeps its default of False.
all_movies = fetch_all_movies(all_links)

Processing link: https://animehay.biz/thong-tin-phim/makeine-too-many-losing-heroines-4117.html
Successfully processed: Makeine: Too Many Losing Heroines!
----------	---------	----------
Processing link: https://animehay.biz/thong-tin-phim/shoshimin-how-to-become-ordinary-4106.html
Successfully processed: Shoshimin: How to Become Ordinary
----------	---------	----------
Processing link: https://animehay.biz/thong-tin-phim/atri-my-dear-moments-4116.html
Successfully processed: Atri: My Dear Moments
----------	---------	----------
Processing link: https://animehay.biz/thong-tin-phim/pokemon-horizons-the-series-3774.html
Successfully processed: Pokémon Horizons: The Series
----------	---------	----------
Processing link: https://animehay.biz/thong-tin-phim/one-piece-34.html
Successfully processed: One Piece
----------	---------	----------
Processing link: https://animehay.biz/thong-tin-phim/giji-harem-4094.html
Successfully processed: Giji Harem
----------	---------	----------
Processing 

In [15]:
import json
def save_json(data, filename):
    """Serialize ``data`` to ``<filename>.json`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.

    Args:
        data: JSON-serializable object to store.
        filename: Destination path without the ``.json`` extension.
    """
    json_path = ''.join((filename, '.json'))
    with open(json_path, mode='w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4, ensure_ascii=False)

In [16]:
# Persist the scraped movies to data_movies.json in the working directory.
save_json(all_movies, 'data_movies')

In [17]:
def print_json(filename):
    """Print the content of ``<filename>.json`` re-formatted with indent 4.

    The file is parsed and re-serialized, so the output is normalized JSON
    with non-ASCII characters shown as-is.

    Args:
        filename: Path of the JSON file without the ``.json`` extension.
    """
    json_path = filename + '.json'
    with open(json_path, mode='r', encoding='utf-8') as json_file:
        loaded = json.load(json_file)
        pretty = json.dumps(loaded, indent=4, ensure_ascii=False)
        print(pretty)

# Display the saved data_movies.json to check what was scraped.
print_json('data_movies')

{
    "Makeine: Too Many Losing Heroines!": {
        "title": "Makeine: Too Many Losing Heroines!",
        "genre": [
            "Anime",
            "Tình cảm",
            "Học đường",
            "Đời thường"
        ],
        "rating": [
            "8.7",
            "99"
        ]
    },
    "Shoshimin: How to Become Ordinary": {
        "title": "Shoshimin: How to Become Ordinary",
        "genre": [
            "Anime",
            "Học đường",
            "Đời thường"
        ],
        "rating": [
            "7.3",
            "47"
        ]
    },
    "Atri: My Dear Moments": {
        "title": "Atri: My Dear Moments",
        "genre": [
            "Anime",
            "Tình cảm",
            "Drama"
        ],
        "rating": [
            "7.5",
            "41"
        ]
    },
    "Pokémon Horizons: The Series": {
        "title": "Pokémon Horizons: The Series",
        "genre": [
            "Anime",
            "Hành động",
            "Hài hước",
            "

In [18]:
import pandas as pd


def json_to_csv(json_filename, csv_filename):
    """Convert ``<json_filename>.json`` into ``<csv_filename>.csv``.

    The JSON file is expected to hold a dict of dicts: every top-level key
    becomes one row and the inner keys become the columns. The top-level
    keys themselves are not written (``index=False``).

    Args:
        json_filename: Source path without the ``.json`` extension.
        csv_filename: Destination path without the ``.csv`` extension.
    """
    source_path = json_filename + '.json'
    target_path = csv_filename + '.csv'

    with open(source_path, mode='r', encoding='utf-8') as json_file:
        records = json.load(json_file)

    table = pd.DataFrame.from_dict(records, orient='index')
    table.to_csv(target_path, index=False, encoding='utf-8')


# Convert data_movies.json into movies_data.csv in the working directory.
json_to_csv('data_movies', 'movies_data')

# Confirmation message (Vietnamese): "Data has been exported to file 'movies_data.csv'".
print("Dữ liệu đã được xuất ra file 'movies_data.csv'")

Dữ liệu đã được xuất ra file 'movies_data.csv'


Lấy mẫu ngẫu nhiên 30 bộ phim từ dữ liệu đã thu thập

In [12]:
import pandas as pd
# Draw a reproducible random sample of up to 30 movies from the exported CSV.
# A forward slash is used so the path resolves on Windows and POSIX alike.
file_path = "data/movies_data.csv"
# NOTE(review): on_bad_lines='skip' silently drops malformed rows - confirm
# that losing such rows is acceptable for this sample.
df = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='skip')
# Cap the size at the number of available rows (DataFrame.sample raises
# ValueError when asked for more rows than exist without replacement) and fix
# the seed so re-running the cell selects the same movies.
sample = df.sample(n=min(30, len(df)), random_state=42)
sample.to_csv('sample.csv', encoding = 'utf-8', index = False)
