# Notebook for anime scraping from MyAnimeList

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import time
from tqdm import tqdm
import random
import json
from datetime import datetime
import pathlib
import os

In [38]:
class Anime:
  """One MyAnimeList anime page, downloaded once and scraped on demand.

  The constructor fetches ``href`` and parses the response. Each
  ``scrape_*`` method reads one field from the parsed page, stores it as an
  instance attribute and returns it. A field whose element is missing or
  whose text cannot be converted is stored as ``np.nan`` instead of raising.
  """

  # Browser-like User-Agent sent with the page request.
  _HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'}
  # Seconds to wait on the server so a stalled connection cannot hang a run.
  _TIMEOUT = 30
  # Errors meaning "field not available": ``find`` returned None
  # (AttributeError/TypeError) or the text did not parse (ValueError).
  _FIELD_ERRORS = (AttributeError, TypeError, ValueError)
  # Row labels whose first value line is copied as-is by scrape_infos.
  _INFO_LABELS = ('English:', 'Studios:', 'Episodes:', 'Status:', 'Aired:', 'Duration:', 'Broadcast:', 'Source:', 'Rating:', 'Demographic:')
  _GENRE_RE = re.compile('Genre:|Genres:')
  _PRODUCERS_RE = re.compile('Producers:')
  _THEME_RE = re.compile('Theme:|Themes:')

  def __init__(self, title, href):
    """Store ``title``/``href`` and download and parse the page at ``href``.

    Raises whatever ``requests.get`` raises (connection error, timeout).
    """
    self.Title = title
    self.href = href
    response = requests.get(href, headers=self._HEADERS, timeout=self._TIMEOUT)
    self.html = BeautifulSoup(response.text, 'html.parser')

  def _text_by_class(self, css_class):
    """Return the stripped text of the first element whose class is ``css_class``.

    Raises AttributeError when no such element exists.
    """
    return self.html.find(class_=css_class).get_text(strip=True)

  def scrape_anime_id(self):
    """Parse the numeric id out of ``href``, store it as ``Id`` and return it."""
    self.Id = int(self.href.split('https://myanimelist.net/anime/')[1].split('/')[0])
    return self.Id

  def scrape_english_title(self):
    """Store and return the English title text, or ``np.nan`` when absent."""
    try:
      # The title is text; it must not be converted to a number.
      english_title = self._text_by_class('title-english title-inherit')
    except self._FIELD_ERRORS:
      english_title = np.nan
    self.English_title = english_title
    return english_title

  def scrape_score(self):
    """Store and return the score as a float, or ``np.nan``."""
    try:
      score = float(self._text_by_class('fl-l score'))
    except self._FIELD_ERRORS:
      score = np.nan
    self.Score = score
    return score

  def scrape_users_scoring(self):
    """Store and return the ``data-user`` count of the score element as an int, or ``np.nan``."""
    try:
      raw = self.html.find(class_='fl-l score').get('data-user')
      # Strip thousands separators and the ' users' suffix; '-' is read as 0.
      users_scoring = int(raw.replace(',', '').replace(' users', '').replace('-', '0'))
    except self._FIELD_ERRORS:
      users_scoring = np.nan
    self.Users_scoring = users_scoring
    return users_scoring

  def scrape_rank(self):
    """Store and return the rank (text after 'Ranked#') as an int, or ``np.nan``."""
    try:
      rank = int(self._text_by_class('numbers ranked').replace('Ranked#', ''))
    except self._FIELD_ERRORS:
      rank = np.nan
    self.Rank = rank
    return rank

  def scrape_popularity(self):
    """Store and return the popularity (text after 'Popularity#') as an int, or ``np.nan``."""
    try:
      popularity = int(self._text_by_class('numbers popularity').replace('Popularity#', ''))
    except self._FIELD_ERRORS:
      popularity = np.nan
    self.Popularity = popularity
    return popularity

  def scrape_members(self):
    """Store and return the member count as an int, or ``np.nan``."""
    try:
      members = int(
          self._text_by_class('numbers members')
          .replace('Members', '')
          .replace(',', '')
      )
    except self._FIELD_ERRORS:
      members = np.nan
    self.Members = members
    return members

  def scrape_season(self):
    """Store and return the season text, or ``np.nan``."""
    try:
      season = self._text_by_class('information season')
    except self._FIELD_ERRORS:
      season = np.nan
    self.Season = season
    return season

  def scrape_show_type(self):
    """Store and return the show type text, or ``np.nan``."""
    try:
      show_type = self._text_by_class('information type')
    except self._FIELD_ERRORS:
      show_type = np.nan
    self.Show_type = show_type
    return show_type

  def scrape_studio(self):
    """Store and return the studio text, or ``np.nan``."""
    try:
      studio = self._text_by_class('information studio author')
    except self._FIELD_ERRORS:
      studio = np.nan
    self.Studio = studio
    return studio

  def scrape_synopsis(self):
    """Store and return the synopsis (``itemprop='description'``) text, or ``np.nan``."""
    try:
      synopsis = (
          self.html
          .find(itemprop='description')
          .get_text(separator=' ', strip=True)
      )
    except self._FIELD_ERRORS:
      synopsis = np.nan
    self.Synopsis = synopsis
    return synopsis

  def scrape_infos(self):
    """Collect the ``spaceit_pad`` information rows.

    Rows containing one of ``_INFO_LABELS`` are stored under their first
    text line (colons removed from the label only), with the second text
    line as the value. ``Genre`` and ``Producers`` are stored as lists of
    link texts; ``Theme`` keeps the text of the first link only. Every
    collected entry is also set as an attribute of the instance.

    Returns:
      dict mapping the collected labels to their values.
    """
    infos_dict = {}

    for info in self.html.find_all(class_='spaceit_pad'):
      txt = info.get_text('\n', strip=True)
      if any(label in txt for label in self._INFO_LABELS):
        lines = txt.splitlines()
        # A label without a value line carries nothing to store.
        if len(lines) >= 2:
          # Clean only the label: values such as a broadcast time
          # ("17:00") must keep their colons.
          infos_dict[lines[0].replace(':', '')] = lines[1]

      if info.find(string=self._GENRE_RE) is not None:
        infos_dict['Genre'] = [a.get_text(strip=True) for a in info.find_all('a')]

      if info.find(string=self._PRODUCERS_RE) is not None:
        infos_dict['Producers'] = [a.get_text(strip=True) for a in info.find_all('a')]

      if info.find(string=self._THEME_RE) is not None:
        first_link = info.find('a')
        if first_link is not None:
          infos_dict['Theme'] = first_link.get_text(strip=True)

    # Publish the attributes once, after all rows have been read.
    for key, value in infos_dict.items():
      setattr(self, key, value)

    return infos_dict

  def scrape_all(self):
    """Run the scrapers that make up a saved record.

    ``scrape_english_title`` and ``scrape_studio`` are not called here;
    ``scrape_infos`` collects the 'English' and 'Studios' rows.
    """
    self.scrape_anime_id()
    self.scrape_score()
    self.scrape_users_scoring()
    self.scrape_rank()
    self.scrape_popularity()
    self.scrape_members()
    self.scrape_season()
    self.scrape_show_type()
    self.scrape_synopsis()
    self.scrape_infos()

In [39]:
def get_animes_by_season(year, season):
  """Fetch the MyAnimeList season page and list the animes on it.

  Args:
    year: season year, interpolated into the URL.
    season: season name, interpolated into the URL.

  Returns:
    ``(titles, hrefs)``: two lists of equal length holding the text of each
    ``h2_anime_title`` element and the ``href`` of its first link.

  Raises:
    Whatever ``requests.get`` raises (connection error, timeout).
  """
  site = f'https://myanimelist.net/anime/season/{year}/{season}'

  headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'}
  # Bound the wait so a stalled connection cannot hang the whole run.
  response = requests.get(site, headers=headers, timeout=30)

  html = BeautifulSoup(response.text, 'html.parser')

  titles, hrefs = [], []
  for heading in html.find_all(class_='h2_anime_title'):
    link = heading.find('a')
    # A heading without a link has no page to scrape; skipping it keeps
    # titles and hrefs aligned.
    if link is None:
      continue
    titles.append(heading.get_text())
    hrefs.append(link.get('href'))

  return titles, hrefs

In [40]:
def scrape_anime(title, href):
    """Download and scrape one anime page.

    Args:
        title: anime title, stored on the record as ``Title``.
        href: URL of the anime page.

    Returns:
        dict of the scraped attributes (without the parsed ``html``), or
        ``np.nan`` when downloading or scraping failed.
    """
    import logging

    try:
        anime = Anime(title=title, href=href)
        anime.scrape_all()
    except Exception as exc:
        # Best effort: one bad page must not abort a whole season, but
        # KeyboardInterrupt/SystemExit still propagate so a run can be stopped.
        logging.getLogger(__name__).warning(
            'Could not scrape %r (%s): %s', title, href, exc
        )
        return np.nan

    data = vars(anime)
    # The parsed page is neither JSON-serialisable nor part of the record.
    data.pop('html', None)
    return data

In [45]:
def scrape_and_save_animes_by_season(years, seasons):
    """Scrape every anime of the given seasons and save one JSON file each.

    Files are written to ``<cwd>/data/animes/<year>/<season>/<Id>.json``.
    Each record also gets ``Scrape_season``, ``Scrape_year`` and
    ``Scrape_date`` (``YYYY-MM-DD``). Animes whose scrape failed are skipped.

    Args:
        years: iterable of years.
        seasons: iterable of season names.
    """
    for year in years:
        for season in seasons:
            # Build the path from components: a backslash literal would turn
            # "\a" into a BEL character and only separates on Windows.
            out_dir = pathlib.Path.cwd() / 'data' / 'animes' / str(year) / season
            out_dir.mkdir(parents=True, exist_ok=True)

            print(f'{year} {season}')
            titles, hrefs = get_animes_by_season(year, season)
            print(f'Animes: {len(titles)}')

            # zip() has no length, so give tqdm the total explicitly.
            for title, href in tqdm(zip(titles, hrefs), total=len(titles)):
                data = scrape_anime(title, href)
                # Failures come back as NaN; "!= np.nan" is always True, so
                # check for the dict a successful scrape returns.
                if isinstance(data, dict):
                    anime_id = str(data['Id'])
                    data['Scrape_season'] = season
                    data['Scrape_year'] = year
                    data['Scrape_date'] = datetime.today().date().strftime('%Y-%m-%d')
                    with open(out_dir / f'{anime_id}.json', 'w', encoding='utf-8') as f:
                        json.dump(data, f)

                # Random 2-6 s pause between page requests.
                time.sleep(random.randint(2, 6))

In [46]:
# Scrape all four seasons of 2020 (the end of the range is exclusive).
seasons = ['winter', 'spring', 'summer', 'fall']
years = range(2020, 2021)

scrape_and_save_animes_by_season(years=years, seasons=seasons)

2020 winter
Animes: 261


261it [21:05,  4.85s/it]


2020 spring
Animes: 214


214it [17:50,  5.00s/it]


2020 summer
Animes: 257


257it [22:03,  5.15s/it]


2020 fall
Animes: 283


283it [23:49,  5.05s/it]
