## MyAnimeList Web Scraping Tool

The purpose of this program is to extract data related to anime from MyAnimeList. This data includes details like the anime's title, genre, and rating, along with reviews and scores provided by members of the platform. The anime list that the program uses as its source is taken from the <a href="https://myanimelist.net/topanime.php">"Top Anime"</a> section of MyAnimeList.

**Note**: This program has a limitation: it will stop if an error is still encountered after the maximum allowed number of attempts to connect to MyAnimeList. To reduce the likelihood of errors, delays have been added between web scraping requests. The duration of the delay can be extended by modifying the **"getSecondsDelay(page_num)"** function. Alternatively, using proxies could potentially resolve this issue.

In [37]:
import requests
import re
import csv
import time
import random
import pandas as pd
import numpy as np
import warnings

from bs4 import BeautifulSoup

warnings.filterwarnings("ignore")

Definitions of the functions for saving files and for web scraping anime information and reviews

In [38]:
# Name of the CSV file (saved under ./Dataset/) that receives the scraped anime information and stats
ANIME_INFO_FILENAME = "AnimeInfo.csv"
# Name of the CSV file (saved under ./Dataset/) that receives the scraped anime reviews
ANIME_REVIEWS_FILENAME = "AnimeReviews.csv"


def saveAnimeInfo(anime_info_list, filename):
    """
    The function that appends the anime information/reviews to a CSV file inside ./Dataset/

    The file is written as UTF-16 with every non-numeric field quoted. The header row is
    written only when the file is still empty, so repeated calls keep appending records to
    the same file. Any error raised while saving is printed instead of being propagated.

    :param list anime_info_list: List of dicts (one per record) holding anime information/reviews;
                                 the keys of the first record are used as the CSV columns
    :param str filename: Name of the file inside the ./Dataset/ directory
    """
    import os  # local import so the function does not rely on a file-level import

    # Nothing to save: return before the file is created and before indexing the list
    if not anime_info_list:
        return

    filepath = "./Dataset/" + filename
    try:
        # Create the output directory on first use instead of failing on open()
        os.makedirs(os.path.dirname(filepath), exist_ok=True)

        with open(filepath, 'a', newline='', encoding="utf-16") as csvfile:

            fieldnames = list(anime_info_list[0].keys())
            writer = csv.DictWriter(csvfile, delimiter=",", lineterminator="\n", fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)

            # In append mode the position starts at the end of the file, so 0 means the file is empty
            if csvfile.tell() == 0:
                writer.writeheader()

            writer.writerows(anime_info_list)

            print(f"Saved {len(anime_info_list)} records")

    except Exception as err:
        # Best-effort save: report the problem and let the caller continue
        print(f"Error saving the file: {err}")


def getSecondsDelay(page_num):
    """
    The function that computes a randomized pause, in seconds, used to slow down the scraping requests
    :param int page_num: Page number; only page_num % 10 contributes to the result
    :return int seconds: A random integer from 0 to 10 (inclusive) plus page_num % 10
    """

    page_offset = page_num % 10
    random_part = random.randint(0, 10)

    return random_part + page_offset


def sanitizeAnimeInfo(field_name, info):
    """
    The function that removes any extraneous details or characters from the data extracted by the scraping process
    :param str field_name: Name of the field/feature
    :param str info: Web scraped information, with text fragments separated by "|"
    :return str info: Filtered information
    """

    if field_name in ('genres', 'themes', 'demographic'):
        # The scraped text may repeat each entry and carries "," fragments between entries
        # (e.g. "Action|Action|,|Drama|Drama"). Split on the "|" fragment separator only, so
        # multi-word names such as "Slice of Life" stay intact, and drop duplicates while
        # keeping the original order so the output is deterministic.
        names = (token.strip(" ,") for token in info.split("|"))
        info = ", ".join(dict.fromkeys(name for name in names if name))

    if field_name in ('producers', 'licensors', 'studios'):
        info = info.replace("|,|", ", ")

    if field_name in ('ranked', 'popularity'):
        # Drop the leading "#" only when it is really there
        if info.startswith("#"):
            info = info[1:]

    if field_name in ('ranked', 'score'):
        # Keep the text before the first "|"; the whole value is kept when there is no "|"
        info = info.partition("|")[0]

    if 'None found' in info:
        info = ''

    return info


def _splitLabeledText(elem):
    """
    The helper that splits the text of a "Label: value" element into its label and value
    :param elem: BeautifulSoup element whose text fragments are joined with "|"
    :return tuple: (label, value) with the trailing ":" removed from the label,
                   or None if the element has a single text fragment only
    """

    text = elem.get_text("|", strip=True)
    label, separator, value = text.partition("|")

    if not separator:
        return None

    return label.rstrip(": "), value


def scrapeAnimeInfo(url, timeout=30):
    """
    The function that web scraped anime information and stats from MyAnimeList
    :param str url: Link of the anime page ("/stats" is appended to it)
    :param timeout: Seconds to wait for the server before the request is aborted
    :return dict anime_info: Key-value pair containing anime information and stats
    :raises requests.HTTPError: If the page request returns an error status
    """

    # Web scrape anime info on the left panel
    anime_info_key = ('anime_id', 'title', 'synonyms', 'japanese', 'english', 'type', 'episodes', 'status', 'aired', 'premiered', 'producers', 
                      'licensors', 'studios', 'source', 'genres', 'themes', 'demographic', 'duration', 'rating', 'score', 
                      'ranked', 'popularity', 'members', 'favorites')
    anime_info = dict.fromkeys(anime_info_key, "")

    page = requests.get(url + "/stats", timeout=timeout)
    # Fail with the HTTP error itself rather than with a missing-element error further down
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    anime_info['anime_id'] = soup.find("input", { "type" : "hidden", "name" : "aid" })['value']
    anime_info['title'] = soup.find("h1", class_="title-name").get_text(strip=True)

    print(f"- {anime_info['title']}")

    left_panel = soup.find("div", class_="leftside")

    for elem in left_panel.find_all("div", class_="spaceit_pad"):
        labeled = _splitLabeledText(elem)
        if labeled is None:
            continue

        field_name = labeled[0].lower()

        # Only the fields listed in anime_info_key are kept
        if field_name not in anime_info:
            continue

        anime_info[field_name] = sanitizeAnimeInfo(field_name, labeled[1])

    # Web scrape anime stats on right panel
    anime_stats_key = ('watching', 'completed', 'on-hold', 'dropped', 'plan_to_watch')
    anime_stats = dict.fromkeys(anime_stats_key, "")

    right_panel = soup.find("div", class_="rightside")

    for elem in right_panel.find_all("div", class_="spaceit_pad"):
        labeled = _splitLabeledText(elem)
        if labeled is None:
            continue

        field_name = labeled[0].replace(" ", "_").lower()

        # The "Total" row ends the list of stats
        if field_name == 'total':
            break

        # Ignore unknown labels so every record keeps the same set of columns
        if field_name in anime_stats:
            anime_stats[field_name] = labeled[1]

    anime_info.update(anime_stats)

    return anime_info


def scrapeAnimeReviews(anime_id, url, timeout=30):
    """
    The function that web scraped anime reviews from MyAnimeList

    The review pages are requested one by one; the reviews of each page are saved to
    ANIME_REVIEWS_FILENAME before the next page is requested. Scraping stops when a page
    has no reviews or has no 'More Reviews' link.

    :param int anime_id: Anime ID
    :param str url: Link of the anime page
    :param timeout: Seconds to wait for the server before a request is aborted
    """

    page_num = 1
    has_more_reviews = True

    while has_more_reviews:
        # Build the page link from the unchanged base link; reassigning `url` here would
        # append the review path and query string again on every page
        page_url = url + "/*/reviews?sort=suggested&filter_check=&filter_hide=&preliminary=on&spoiler=off&p=" + str(page_num)
        page = requests.get(page_url, timeout=timeout)
        soup = BeautifulSoup(page.content, "html.parser")

        anime_reviews_list = []
        for elem in soup.find_all("div", class_="review-element"):
            username = elem.find("div", "username").get_text(strip=True)
            rating = elem.find("div", "rating").get_text(strip=True)
            review = elem.find("div", "text").get_text(strip=True)

            anime_reviews_list.append({
                'anime_id': anime_id,
                'username': username,
                # Keep only the text after the first ":" of the rating label
                'rating': rating[(rating.find(":")+1):],
                'review': review,
            })

        print(f"Page No.: {page_num} - {len(anime_reviews_list)} reviews")

        if anime_reviews_list:
            saveAnimeInfo(anime_reviews_list, ANIME_REVIEWS_FILENAME)

            # The pagination element may be missing; treat that as "no more pages"
            prev_next_page = soup.find("div", class_="ml4 mb8")

            if prev_next_page is not None and 'More Reviews' in prev_next_page.get_text(strip=True):
                page_num = page_num + 1
            else:
                has_more_reviews = False
        else:
            has_more_reviews = False

        # Pause after every page, including the last one, before the next request is made
        time.sleep(getSecondsDelay(page_num))

Perform web scraping to extract anime information. The list of anime is taken from the Top Anime page of <a href="https://myanimelist.net/">MyAnimeList</a>.

In [39]:
HTTP_RESPONSE_OK = 200
status_code = HTTP_RESPONSE_OK

# NOTE: In case the program encounters an error and stops, this field can be modified to the pagination value
# where the error occurred, allowing the process to resume. The pagination number can be determined by referring
# to the logs provided below.
page_num = 0

while status_code == HTTP_RESPONSE_OK:
    print(f"\nPagination Number: {page_num}")

    url = 'https://myanimelist.net/topanime.php?limit=' + str(page_num)
    page = requests.get(url, timeout=30)
    status_code = page.status_code

    # Stop right away on a failed request instead of parsing and saving an error page
    if status_code != HTTP_RESPONSE_OK:
        print(f"Stopped: HTTP status {status_code}")
        break

    soup = BeautifulSoup(page.content, "html.parser")
    anime_url_list = soup.find_all("h3", class_="hoverinfo_trigger fl-l fs14 fw-b anime_ranking_h3")

    # A listing page without entries means there is nothing left to scrape
    if not anime_url_list:
        break

    anime_info_list = []

    for elem in anime_url_list:
        anime_url = elem.find("a").get("href")
        anime_info = scrapeAnimeInfo(anime_url)
        anime_info_list.append(anime_info)

        time.sleep(getSecondsDelay(page_num))

    saveAnimeInfo(anime_info_list, ANIME_INFO_FILENAME)

    # Each listing page is requested with an offset that grows by 50
    page_num = page_num + 50


Pagination Number: 0
- Shingeki no Kyojin: The Final Season - Kanketsu-hen
Saved 1 records


Open the anime information dataset created earlier. The IDs contained within it will be utilized to acquire anime reviews. This retrieval process has been separated from the retrieval of anime information because MyAnimeList generates errors if the maximum number of connections is exceeded.

In [43]:
# Load the anime information dataset; it is read as UTF-16, with "," treated as the thousands separator
filepath = f"./Dataset/{ANIME_INFO_FILENAME}"
anime_info_df = pd.read_csv(filepath, thousands=",", encoding='utf-16')
# Preview the first 5 records
anime_info_df.head(5)

Unnamed: 0,anime_id,title,synonyms,japanese,english,type,episodes,status,aired,premiered,...,score,ranked,popularity,members,favorites,watching,completed,on-hold,dropped,plan_to_watch
0,51535,Shingeki no Kyojin: The Final Season - Kankets...,"Shingeki no Kyojin: The Final Season Part 3, S...",進撃の巨人 The Final Season完結編,Attack on Titan: Final Season - The Final Chap...,Special,2,Currently Airing,"Mar 4, 2023 to 2023",,...,9.13,1,568,368525,7288,175801,345,15277,823,176279


Perform web scraping to extract user reviews and the corresponding scores for each anime

In [41]:
url_base = "https://myanimelist.net/anime/"

# NOTE: In case the program encounters an error and stops, this field can be modified to the anime index where the error
# occurred, allowing the process to resume. The index can be determined by referring to the logs provided below.
# Before re-running the program, first check the file and remove all reviews related to the anime that has failed.
anime_idx_start = 0

anime_df = anime_info_df[['anime_id', 'title']]
anime_df = anime_df.loc[anime_idx_start:,]

# Convert the index and the columns once, instead of rebuilding the arrays on every iteration
anime_indices = np.array(anime_df.index)
anime_ids = np.array(anime_df['anime_id'])
anime_titles = np.array(anime_df['title'])

for anime_idx, anime_id, anime_title in zip(anime_indices, anime_ids, anime_titles):
    print(f"\n{anime_idx}. {anime_title} ({anime_id})")

    url_full = url_base + str(anime_id)
    scrapeAnimeReviews(anime_id, url_full)


0. Shingeki no Kyojin: The Final Season - Kanketsu-hen (51535)
Page No.: 1 - 20 reviews
Saved 20 records
Page No.: 2 - 20 reviews
Saved 20 records
Page No.: 3 - 17 reviews
Saved 17 records

1. Shingeki no Kyojin: The Final Season - Kanketsu-hen (51535)
Page No.: 1 - 20 reviews
Saved 20 records
Page No.: 2 - 20 reviews
Saved 20 records
Page No.: 3 - 17 reviews
Saved 17 records


Open the anime reviews dataset created earlier

In [44]:
# Load the anime reviews dataset; it is read as UTF-16, with "," treated as the thousands separator
filepath = f"./Dataset/{ANIME_REVIEWS_FILENAME}"
# NOTE(review): this reuses the name anime_info_df for the reviews data, replacing the anime
# information loaded earlier; re-load that dataset before re-running the review scraping cell
anime_info_df = pd.read_csv(filepath, thousands=",", encoding='utf-16')
# Preview the first 5 records
anime_info_df.head(5)

Unnamed: 0,anime_id,username,rating,review
0,51535,NineTnk,10,This is what peak storytelling meet peak adapt...
1,51535,I_am_free_950,10,"Hajime Isayama, I have no doubt that he is a p..."
2,51535,Doofenheimer,10,The pinnacle of shonen has returned. Attack On...
3,51535,AnimeOdyssey,2,"As a fan of the anime medium, it is with great..."
4,51535,Mecopterraaa,10,"It was a masterpiece, the animation quality, t..."
