In [124]:
import pickle
import numpy as np
import pandas as pd
import json
import sqlalchemy as sql
from sqlalchemy import create_engine
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
from io import StringIO 
import time
import re
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool
import random
from urllib.parse import quote

In [2]:
# Read the database login from the JSON credentials file kept outside the notebook.
with open('../tools/credentials.json') as creds_fh:
    credentials = json.load(creds_fh)

# Both values are consumed by the connection-string cell that follows.
db_login = credentials["dblogin"]
username, password = db_login["username"], db_login["password"]

In [3]:
# Percent-encode the credentials before embedding them in the URL: characters
# such as '@', ':' or '/' inside a password would otherwise be read as URL
# delimiters and yield a wrong host or a failed login.
db_string = (
    f"postgresql://{quote(username, safe='')}:{quote(password, safe='')}"
    "@192.168.0.3:5432/animeplanet"
)
db = create_engine(db_string)

In [4]:
def chunker(seq, size):
    """Lazily split `seq` into consecutive slices of at most `size` items.

    Args:
        seq: Any object supporting `len()` and slicing (list, str, DataFrame, ...).
        size: Maximum length of each slice; used as the step of the start offsets.

    Returns:
        A generator yielding `seq[start:start + size]` for start = 0, size, 2*size, ...
        The last slice may be shorter than `size`.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

### Get Anime List

In [5]:
# Progress marker: start of the list-scraping stage.
print('scraping anime list...')

scraping anime list...


In [6]:
# Listing pages are addressed by appending a page number to this prefix.
base_url = 'https://www.anime-planet.com/anime/top-anime?page='

# Request page 1 via the special-requests endpoint on 192.168.0.3:5000 and keep
# its <ul class="nav"> element; the page links inside it are read in the next cell.
url = base_url + str(1)
proxied_url = 'http://192.168.0.3:5000/special-requests?url=' + quote(url)
resp = requests.get(proxied_url)
soup = BeautifulSoup(resp.text, 'html.parser')
ul = soup.find('ul', attrs={'class': 'nav'})

In [7]:
# Collect the text of every link in the navigation list that parses as an
# integer; links with non-numeric text are skipped.
page_nums = []
for tag in ul.find_all('a'):
    try:
        page_nums.append(int(tag.text))
    except ValueError:
        # Only a failed integer parse is expected here; anything else
        # (including KeyboardInterrupt) must propagate.
        continue

if not page_nums:
    # Same exception type max([]) would raise, but with an actionable message.
    raise ValueError('no page numbers found in the navigation list of the first listing page')

# The largest page number seen is used as the number of listing pages.
num_pages = max(page_nums)

urls = [f'{base_url}{i}' for i in range(1, num_pages+1)]

In [8]:
def scrapeTable(url, max_attempts=3):
    """Download one listing page and return its table as a DataFrame.

    The page is requested through the special-requests endpoint. An empty
    response body, or a body without a <table>, counts as a failed attempt
    and the request is repeated.

    Args:
        url: Address of the listing page; percent-encoded into the `url`
            query parameter of the endpoint.
        max_attempts: Maximum number of requests before giving up. The
            previous implementation retried without limit via recursion.

    Returns:
        DataFrame with lower-cased columns `title`, `type`, `year` (taken from
        the table, indexed by its `Rank` column) plus `url` (pandas string
        dtype), built from the anchors found inside the table.

    Raises:
        RuntimeError: if no usable table was obtained in `max_attempts` tries.
    """
    request_url = f'http://192.168.0.3:5000/special-requests?url={quote(url)}'

    for _ in range(max_attempts):
        resp = requests.get(request_url)
        if resp.text == '':
            continue

        soup = BeautifulSoup(resp.text, 'html.parser')
        table = soup.find('table')
        if table is None:
            continue

        # .copy() so the column assignment below works on an independent frame.
        chunk = pd.read_html(StringIO(str(table)), index_col='Rank')[0][['Title', 'Type', 'Year']].copy()
        # A conditional expression only builds the link when href exists;
        # np.where evaluated both branches and so failed on anchors without href.
        chunk['url'] = [
            'https://www.anime-planet.com' + tag.get('href') if tag.has_attr('href') else 'no link'
            for tag in table.find_all('a')
        ]
        chunk.columns = [col.lower() for col in chunk.columns]
        chunk['url'] = chunk['url'].astype('string')
        return chunk

    raise RuntimeError(f'no table could be scraped from {url} after {max_attempts} attempts')

In [9]:
chunksize = 10

url_chunks = chunker(urls, chunksize)
# Ceiling division, so the progress-bar total equals the real number of chunks
# even when len(urls) is an exact multiple of chunksize.
num_chunks = (len(urls) + chunksize - 1) // chunksize

# Collect the per-page frames and concatenate once at the end instead of
# re-copying the growing DataFrame on every iteration.
page_frames = []
for url_chunk in tqdm(url_chunks, total=num_chunks):
    # One worker per URL, so a whole chunk is requested concurrently.
    with ThreadPoolExecutor(max_workers=chunksize) as executor:
        page_frames.extend(executor.map(scrapeTable, url_chunk))

    # Random pause between chunks, clamped to the range 1-5 seconds.
    time.sleep(max(min(np.random.poisson(2), 5), 1))

df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

100%|██████████| 50/50 [04:08<00:00,  4.96s/it]


In [10]:
# Keep one row per URL: drop_duplicates retains the first occurrence and,
# with ignore_index=True, renumbers the index from 0.
df = df.drop_duplicates(['url'], ignore_index=True)

In [12]:
print('saving data to file...')
df.to_csv('../data/anime_list.csv.xz', index=False)

# Engine.begin() opens a transaction that commits on success and rolls back on
# error, so the DELETE and the re-insert are applied together or not at all.
# With a plain connect() nothing is committed under SQLAlchemy 2.x, and under
# 1.x a failed insert would leave the table emptied.
with db.begin() as con:
    print('removing from db...')
    con.execute(sql.text("DELETE FROM anime;"))

    print('saving data to db...')
    df.to_sql('anime', con, if_exists='append', index=False, method='multi')

saving data to file...
removing from db...
saving data to db...


### Scrape Anime Pages

In [21]:
# Progress marker: start of the per-title page scraping stage.
print('scraping anime pages...')

scraping anime pages...


In [22]:
# Load the whole `anime` table from the database into a DataFrame.
df = pd.read_sql('anime', db)

In [23]:
def getPage(url, attempt=1):
    """Fetch the HTML of `url` through the special-requests endpoint.

    Args:
        url: Page address; percent-encoded into the `url` query parameter.
        attempt: Number of the attempt to start from. Attempts up to and
            including 3 are made; a value of 4 or more returns the failure
            result immediately without issuing a request.

    Returns:
        A `(url, html)` tuple. `html` is `''` when every attempt returned an
        empty body or raised a requests exception.
    """
    last_attempt = 3
    while attempt <= last_attempt:
        try:
            resp = requests.get(f'http://192.168.0.3:5000/special-requests?url={quote(url)}')
        except requests.RequestException:
            # A network error counts as a failed attempt instead of aborting
            # the whole thread-pool batch the call is part of.
            pass
        else:
            if resp.text != '':
                return (url, resp.text)
        attempt += 1
    return (url, '')

In [24]:
chunksize = 10

url_list = df['url'].to_list()
url_chunks = chunker(url_list, chunksize)
# Ceiling division, so the progress-bar total equals the real number of chunks
# even when len(url_list) is an exact multiple of chunksize.
num_chunks = (len(url_list) + chunksize - 1) // chunksize

url_html_dict = {}
for url_chunk in tqdm(url_chunks, total=num_chunks):
    with ThreadPoolExecutor(max_workers=chunksize) as executor:
        # getPage returns (url, html) pairs, which dict.update accepts directly.
        url_html_dict.update(executor.map(getPage, url_chunk))

    # Random pause between chunks, clamped to the range 4-30 seconds.
    time.sleep(max(min(np.random.poisson(10), 30), 4))

100%|██████████| 1713/1713 [5:36:33<00:00, 11.79s/it]  


In [25]:
# Attach the downloaded HTML to each row by looking its URL up in url_html_dict.
df['html_text'] = df['url'].map(url_html_dict)

In [26]:
print('saving data to file...')
df.to_csv('../data/anime_list_html.csv.xz', index=False)

# Pass the URLs as bound parameters (an "expanding" IN list) instead of pasting
# the repr of a Python list into the SQL text: a URL containing a quote would
# break the statement, and string-built SQL is an injection risk.
delete_stmt = sql.text("DELETE FROM web_scrape WHERE url IN :urls").bindparams(
    sql.bindparam('urls', expanding=True)
)
scraped_urls = df['url'].to_list()
html_rows = df[['url', 'html_text']]
rows_per_insert = 1000

# One transaction for delete + re-insert: committed on success, rolled back on error.
with db.begin() as con:
    print('removing from db...')
    if scraped_urls:
        con.execute(delete_stmt, {'urls': scraped_urls})
    print('saving data to db...')
    chunks = chunker(html_rows, rows_per_insert)
    num_inserts = (len(html_rows) + rows_per_insert - 1) // rows_per_insert
    for chunk in tqdm(chunks, total=num_inserts):
        chunk.to_sql('web_scrape', con, if_exists='append', index=False, method='multi')

saving data to file...
removing from db...


0it [00:00, ?it/s]

saving data to db...


18it [02:59,  9.99s/it]


### Extracting additional info

In [79]:
# Write the stored HTML of one known title to disk (presumably as a sample for
# inspecting the page structure — confirm). The lookup happens before the file
# is opened so a failed lookup does not leave a truncated, empty file behind.
sample_html = df.loc[df['title'] == 'Fullmetal Alchemist: Brotherhood', 'html_text'].iloc[0]

# Explicit UTF-8 so non-ASCII page content does not depend on the locale's default encoding.
with open('../data/test.html', 'w', encoding='utf-8') as file:
    file.write(sample_html)

In [148]:
def _clean_text(node):
    """Return the node's text with newlines replaced by spaces and stripped, or None if `node` is None."""
    if node is None:
        return None
    return node.text.replace('\n', ' ').strip()


def parseInfo(html):
    """Extract the descriptive fields of one anime page.

    Every lookup tolerates a missing element and yields None for that field,
    so a single incomplete or empty page (getPage stores '' for failed
    downloads) cannot abort a whole parallel map.

    Args:
        html: HTML text of the page. Empty, whitespace-only or non-string
            values produce a tuple of nine None values.

    Returns:
        Tuple `(title, num_eps, studio, season_year, rating, synopsis, tags,
        content_warnings, url)`. `tags` and `content_warnings` are lists of
        strings or None; the remaining fields are strings or None.
    """
    if not isinstance(html, str) or not html.strip():
        return (None,) * 9

    # Explicit parser: avoids bs4 picking a different parser depending on what
    # is installed, and matches the other cells of this notebook.
    soup = BeautifulSoup(html, 'html.parser')

    title_tag = soup.find('h1', {'itemprop': 'name'})
    title = title_tag.text if title_tag is not None else None

    # The entry bar holds format/episodes, studio, season and rating.
    num_eps = studio = season_year = rating = None
    section = soup.find(attrs={'class': 'pure-g entryBar'})
    if section is not None:
        num_eps = _clean_text(section.find('span', {'class': 'type'}))

        studio_tag = section.find('a', {'href': re.compile(r'/anime/studios/.*')})
        studio = studio_tag.text if studio_tag is not None else None

        season_tag = section.find('a', {'href': re.compile(r'/anime/seasons/.*')})
        season_year = season_tag.text if season_tag is not None else None

        rating = _clean_text(section.find('div', {'class': 'avgRating'}))

    tags = None
    tags_section = soup.find('div', {'class': 'tags'})
    if tags_section is not None:
        tag_links = tags_section.find_all('a', {'href': re.compile(r'/anime/tags/.*')})
        tags = [_clean_text(link) for link in tag_links]

    content_warnings = None
    cw_section = soup.find('div', {'class': 'tags tags--plain'})
    if cw_section is not None:
        content_warnings = [_clean_text(item) for item in cw_section.find_all('li')]

    # The synopsis is taken from the first <p> element of the document.
    first_paragraph = soup.find('p')
    synopsis = first_paragraph.text if first_paragraph is not None else None

    link_tag = soup.find('link', {'href': re.compile(r'https://www.anime-planet.com/anime/')})
    url = link_tag['href'] if link_tag is not None else None

    return (title, num_eps, studio, season_year, rating, synopsis, tags, content_warnings, url)

In [149]:
# Parse every stored page in parallel on 14 worker processes; Pool.map returns
# the result tuples in the same order as the input rows.
with Pool(processes=14) as worker_pool:
    parsed_rows = worker_pool.map(parseInfo, df['html_text'])
list_of_tups = list(parsed_rows)

In [150]:
# Column labels, in the order of the tuple returned by parseInfo.
anime_columns = [
    'title', 'num_eps', 'studio', 'season_year', 'rating',
    'synopsis', 'tags', 'content_warnings', 'url',
]
anime = pd.DataFrame(list_of_tups, columns=anime_columns)

In [155]:
# Display the parsed table.
anime

Unnamed: 0,title,num_eps,studio,season_year,rating,synopsis,tags,content_warnings,url
0,Gag Manga Biyori 2,TV (12 eps x 5 min),Artland,Summer 2006,3.583 out of 5 from 233 votes,"The lupine detective Usami-chan is back, and r...","[Comedy, Shounen, Crude, Episodic, Gag, Short ...",,https://www.anime-planet.com/anime/gag-manga-b...
1,Fu Yu Nu,Movie (1 ep x 5 min),,,7 needed to calculate an average,No synopsis yet - check back soon!,"[Chinese Animation, Shorts]",,https://www.anime-planet.com/anime/fu-yu-nu
2,Kijeu CSI: Gwahaksusadae,TV (104 eps x 23 min),,,7 needed to calculate an average,No synopsis yet - check back soon!,"[Adventure, Mystery, Korean Animation]",,https://www.anime-planet.com/anime/kijeu-csi-g...
3,Zuoshou Shanglan,Web,,,10 needed to calculate an average,No synopsis yet - check back soon!,"[Sports, Basketball, Chinese Animation]",,https://www.anime-planet.com/anime/zuoshou-sha...
4,Jeonsa Ryan,Movie (1 ep x 80 min),,,10 needed to calculate an average,No synopsis yet - check back soon!,"[Adventure, Fantasy, Family Friendly, Korean A...",,https://www.anime-planet.com/anime/jeonsa-ryan
...,...,...,...,...,...,...,...,...,...
17122,BanG Dream! Movie: Episode of Roselia - Part I...,Movie (1 ep),SANZIGEN,,3.528 out of 5 from 31 votes,No synopsis yet - check back soon!,"[Idols, Music]",,https://www.anime-planet.com/anime/bang-dream-...
17123,Yu Yu Hakusho Picture Drama,DVD Special (1 ep x 6 min),Pierrot,,3.528 out of 5 from 811 votes,No synopsis yet - check back soon!,"[Comedy, Shounen, Picture Drama, Based on a Ma...",,https://www.anime-planet.com/anime/yu-yu-hakus...
17124,Kannagi: If You Are a Shrine Maiden,DVD Special (1 ep x 24 min),A-1 Pictures,,"3.528 out of 5 from 2,082 votes",One day Shino and Takako find a money clip in ...,"[Comedy, Fantasy, Shoujo, Japanese Mythology, ...",,https://www.anime-planet.com/anime/kannagi-if-...
17125,KADO: The Right Answer,TV (12 eps),Toei Animation,Spring 2017,"3.527 out of 5 from 3,203 votes",Koujiro Shindo is a highly-skilled negotiator ...,"[Drama, Sci Fi, Political, CG Animation, Origi...",,https://www.anime-planet.com/anime/kado-the-ri...


In [156]:
# Per-column memory footprint in units of 10**6 bytes; deep=True also counts
# the Python objects held in object-dtype columns.
anime.memory_usage(deep=True) / 10**6

Index               0.000128
title               1.416130
num_eps             1.286871
studio              0.904515
season_year         0.593286
rating              1.481485
synopsis            6.433812
tags                1.963144
url                 1.994085
dtype: float64

In [242]:
# Inspect the raw num_eps strings.
anime['num_eps']

0                TV (12 eps x 5 min)
1               Movie (1 ep x 5 min)
2              TV (104 eps x 23 min)
3                                Web
4              Movie (1 ep x 80 min)
                    ...             
17122                   Movie (1 ep)
17123     DVD Special (1 ep x 6 min)
17124    DVD Special (1 ep x 24 min)
17125                    TV (12 eps)
17126           Web (20 eps x 2 min)
Name: num_eps, Length: 17127, dtype: object

In [379]:
# Split the free-text num_eps field into named groups:
#   type     - one of the listed media types (e.g. "TV", "DVD Special")
#   num_eps  - the digits at the start of the parenthesised part
#   ongoing  - a "+" directly after those digits, if present
#   duration - the digits of the "x N min" suffix, if present
# The whole parenthesised part is optional, so a bare value such as "Web"
# still yields a type with the other groups left missing.
pattern = r"""(?P<type>(?:TV\sSpecial|TV|Movie|OVA|Music\sVideo|Other|DVD\sSpecial|Web)+)(?:\s+\((?P<num_eps>\d+)(?P<ongoing>\+)?\seps?(?:\sx\s(?P<duration>\d+)\smin)?\))?"""
tmp = anime['num_eps'].str.extract(pattern)

In [383]:
# Turn the captured "+" into a boolean flag. Matching both "+" (raw regex
# capture) and True (already converted) keeps this cell idempotent: with a
# plain .notnull(), running the cell a second time would mark every row True.
ongoing_raw = tmp['ongoing']
tmp['ongoing'] = ongoing_raw.eq('+') | ongoing_raw.eq(True)

In [385]:
# Show only the rows whose ongoing flag is True.
tmp.loc[tmp['ongoing']]

Unnamed: 0,type,num_eps,ongoing,duration
48,TV,21,True,
66,Web,21,True,1
123,Other,9,True,2
224,Web,5,True,1
254,TV,20,True,
...,...,...,...,...
16680,TV,9,True,
16712,Web,13,True,2
16809,OVA,2,True,
16997,TV,10,True,


In [386]:
# Display the parsed table; the regex extraction wrote to `tmp`, not to `anime`.
anime

Unnamed: 0,title,num_eps,studio,season_year,rating,synopsis,tags,content_warnings,url
0,Gag Manga Biyori 2,TV (12 eps x 5 min),Artland,Summer 2006,3.583 out of 5 from 233 votes,"The lupine detective Usami-chan is back, and r...","[Comedy, Shounen, Crude, Episodic, Gag, Short ...",,https://www.anime-planet.com/anime/gag-manga-b...
1,Fu Yu Nu,Movie (1 ep x 5 min),,,7 needed to calculate an average,No synopsis yet - check back soon!,"[Chinese Animation, Shorts]",,https://www.anime-planet.com/anime/fu-yu-nu
2,Kijeu CSI: Gwahaksusadae,TV (104 eps x 23 min),,,7 needed to calculate an average,No synopsis yet - check back soon!,"[Adventure, Mystery, Korean Animation]",,https://www.anime-planet.com/anime/kijeu-csi-g...
3,Zuoshou Shanglan,Web,,,10 needed to calculate an average,No synopsis yet - check back soon!,"[Sports, Basketball, Chinese Animation]",,https://www.anime-planet.com/anime/zuoshou-sha...
4,Jeonsa Ryan,Movie (1 ep x 80 min),,,10 needed to calculate an average,No synopsis yet - check back soon!,"[Adventure, Fantasy, Family Friendly, Korean A...",,https://www.anime-planet.com/anime/jeonsa-ryan
...,...,...,...,...,...,...,...,...,...
17122,BanG Dream! Movie: Episode of Roselia - Part I...,Movie (1 ep),SANZIGEN,,3.528 out of 5 from 31 votes,No synopsis yet - check back soon!,"[Idols, Music]",,https://www.anime-planet.com/anime/bang-dream-...
17123,Yu Yu Hakusho Picture Drama,DVD Special (1 ep x 6 min),Pierrot,,3.528 out of 5 from 811 votes,No synopsis yet - check back soon!,"[Comedy, Shounen, Picture Drama, Based on a Ma...",,https://www.anime-planet.com/anime/yu-yu-hakus...
17124,Kannagi: If You Are a Shrine Maiden,DVD Special (1 ep x 24 min),A-1 Pictures,,"3.528 out of 5 from 2,082 votes",One day Shino and Takako find a money clip in ...,"[Comedy, Fantasy, Shoujo, Japanese Mythology, ...",,https://www.anime-planet.com/anime/kannagi-if-...
17125,KADO: The Right Answer,TV (12 eps),Toei Animation,Spring 2017,"3.527 out of 5 from 3,203 votes",Koujiro Shindo is a highly-skilled negotiator ...,"[Drama, Sci Fi, Political, CG Animation, Origi...",,https://www.anime-planet.com/anime/kado-the-ri...


In [387]:
# Count of missing values in each extracted column.
tmp.isnull().sum()

type           0
num_eps      465
ongoing        0
duration    5674
dtype: int64

In [388]:
# Number of rows per extracted media type.
tmp['type'].value_counts()

TV             5175
Movie          3066
OVA            2204
Web            2199
Music Video    2008
Other           918
DVD Special     888
TV Special      669
Name: type, dtype: int64

In [389]:
# Count of missing values in each extracted column.
tmp.isnull().sum()

type           0
num_eps      465
ongoing        0
duration    5674
dtype: int64

In [157]:
# Write the parsed per-title details to the `anime` table.
# NOTE(review): if_exists='replace' drops and recreates the table that the
# list-scraping stage filled, and the execution count (157) shows this cell ran
# before the num_eps extraction cells — confirm the overwrite is intended.
anime.to_sql('anime', db, if_exists='replace', index=False, method='multi')