In [1]:
import pickle
import numpy as np
import pandas as pd
import json
import sqlalchemy as sql
from sqlalchemy import create_engine
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
from io import StringIO 
import time
import re
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool
import random
from urllib.parse import quote

In [2]:
# Read the database login from the shared credentials file.
with open('../tools/credentials.json') as file:
    credentials = json.load(file)

# Both values live under the "dblogin" entry of the JSON document.
dblogin = credentials["dblogin"]
username, password = dblogin["username"], dblogin["password"]

In [3]:
# URL-escape the credentials before embedding them in the connection URL so
# that characters such as '@', ':' or '/' in the username or password cannot
# corrupt it. Values without reserved characters are left unchanged.
# NOTE(review): assumes credentials.json stores the raw (not already
# percent-encoded) password — confirm.
db_string = (
    f"postgresql://{quote(username, safe='')}:{quote(password, safe='')}"
    "@192.168.0.3:5432/animeplanet"
)
db = create_engine(db_string)

In [4]:
def chunker(seq, size):
    """Lazily split `seq` into consecutive slices of at most `size` items.

    `seq` must support `len()` and slicing. The final slice may be shorter
    than `size`. Returns a generator of slices.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

### Get Anime List

In [None]:
# Progress marker for the listing-scrape stage.
print('scraping anime list...')

In [None]:
base_url = 'https://www.anime-planet.com/anime/top-anime?page='

# Fetch page 1 of the listing through the local `special-requests` endpoint
# and locate the pagination list, from which the page count is derived.
url = f'{base_url}{1}'
resp = requests.get(f'http://192.168.0.3:5000/special-requests?url={quote(url)}')
soup = BeautifulSoup(resp.text, 'html.parser')
ul = soup.find('ul', attrs={'class':'nav'})

# Fail here with a clear message instead of an AttributeError further down
# when the response was empty or the page layout changed.
if ul is None:
    raise RuntimeError(f'pagination list not found in the response for {url}')

In [None]:
# Collect every pagination link whose text is a page number.
page_nums = []
for tag in ul.find_all('a'):
    try:
        page_nums.append(int(tag.text))
    except ValueError:
        # Links whose text is not an integer are skipped.
        continue

if not page_nums:
    raise RuntimeError('no numeric page links found in the pagination list')

# The highest page number seen is taken as the total number of pages.
num_pages = max(page_nums)

urls = [f'{base_url}{i}' for i in range(1, num_pages+1)]

In [None]:
def scrapeTable(url, max_attempts=10):
    """Scrape one listing page into a DataFrame.

    Parameters
    ----------
    url : str
        Listing page URL, fetched through the local `special-requests` endpoint.
    max_attempts : int, default 10
        How many times to request the page before giving up when the endpoint
        answers with an empty body.

    Returns
    -------
    pandas.DataFrame
        Columns `title`, `type`, `year` and `url` (string dtype), indexed by
        the table's `Rank` column.

    Raises
    ------
    RuntimeError
        If every attempt returned an empty body.
    """
    # Bounded retry loop: the previous version recursed without limit on
    # empty responses.
    for _ in range(max_attempts):
        resp = requests.get(f'http://192.168.0.3:5000/special-requests?url={quote(url)}')
        if resp.text != '':
            break
    else:
        raise RuntimeError(f'no content returned for {url} after {max_attempts} attempts')

    soup = BeautifulSoup(resp.text, 'html.parser')
    table = soup.find('table')
    # .copy() so that adding the `url` column does not write into a slice of
    # the frame returned by read_html.
    chunk = pd.read_html(StringIO(str(table)), index_col='Rank')[0][['Title', 'Type', 'Year']].copy()
    # A conditional expression only builds the link when `href` exists.
    # NOTE(review): assumes the table has exactly one <a> per row — confirm.
    chunk['url'] = ['https://www.anime-planet.com' + tag.get('href') if tag.has_attr('href') else 'no link'
                    for tag in table.find_all('a')]
    chunk.columns = [col.lower() for col in chunk.columns]
    chunk['url'] = chunk['url'].astype('string')
    return chunk

In [None]:
chunksize = 10

url_chunks = chunker(urls, chunksize)
# Ceiling division gives the exact number of chunks for the progress bar.
num_chunks = -(-len(urls) // chunksize)

# Collect the per-page frames and concatenate once at the end instead of
# re-copying a growing DataFrame on every iteration.
frames = []
for url_chunk in tqdm(url_chunks, total=num_chunks):
    # One thread per URL in the chunk; executor.map keeps the input order.
    with ThreadPoolExecutor(max_workers=chunksize) as executor:
        frames.extend(executor.map(scrapeTable, url_chunk))

    # Pause 1-5 seconds (Poisson-distributed, clamped) between chunks.
    time.sleep(max(min(np.random.poisson(2), 5), 1))

df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Keep a single row per anime URL and renumber the index afterwards.
df = df.drop_duplicates(['url'], ignore_index=True)

In [None]:
print('saving data to file...')
df.to_csv('../data/anime_list.csv.xz', index=False)

# Engine.begin() runs the DELETE and the INSERT in one transaction: it is
# committed when the block exits normally and rolled back on error, so a
# failed insert cannot leave the table emptied.
with db.begin() as con:
    print('removing from db...')
    query = "DELETE FROM anime;"
    con.execute(sql.text(query))

    print('saving data to db...')
    df.to_sql('anime', con, if_exists='append', index=False, method='multi')

### Scrape Anime Pages

In [None]:
# Progress marker for the per-anime page scrape stage.
print('scraping anime pages...')

In [None]:
# Load the whole `anime` table from the database into `df`.
df = pd.read_sql('anime', db)

In [None]:
def getPage(url, attempt=1):
    """Fetch the HTML of `url` through the local `special-requests` endpoint.

    Parameters
    ----------
    url : str
        Page to fetch.
    attempt : int, default 1
        Number of the first attempt; attempts are made up to and including
        the third. Any value above 3 returns the empty result immediately.

    Returns
    -------
    tuple[str, str]
        `(url, html)`, where `html` is `''` if no attempt produced content.
    """
    max_attempts = 3
    while attempt <= max_attempts:
        try:
            resp = requests.get(f'http://192.168.0.3:5000/special-requests?url={quote(url)}')
        except requests.RequestException:
            # A failed request counts as a failed attempt, so one bad
            # connection does not abort the whole threaded batch.
            attempt += 1
            continue
        if resp.text != '':
            return (url, resp.text)
        attempt += 1
    return (url, '')

In [None]:
chunksize = 10

url_list = df['url'].to_list()
url_chunks = chunker(url_list, chunksize)
# Ceiling division gives the exact number of chunks for the progress bar.
num_chunks = -(-len(url_list) // chunksize)

url_html_dict = {}
for url_chunk in tqdm(url_chunks, total=num_chunks):
    # getPage returns (url, html) pairs, which dict.update consumes directly.
    with ThreadPoolExecutor(max_workers=chunksize) as executor:
        url_html_dict.update(executor.map(getPage, url_chunk))

    # Pause 4-30 seconds (Poisson-distributed, clamped) between chunks.
    time.sleep(max(min(np.random.poisson(10), 30), 4))

In [None]:
# Look up each row's HTML by URL; URLs absent from url_html_dict become NaN.
df['html_text'] = df['url'].map(url_html_dict)

In [None]:
print('saving data to file...')
df.to_csv('../data/anime_list_html.csv.xz', index=False)

# The URL list is passed as an expanding bind parameter instead of being
# pasted into the SQL text, so URLs containing quotes cannot break or alter
# the statement.
delete_stmt = sql.text("DELETE FROM web_scrape WHERE url IN :urls").bindparams(
    sql.bindparam('urls', expanding=True)
)

# Engine.begin() commits on success and rolls back on error, keeping the
# delete and the re-insert atomic.
with db.begin() as con:
    print('removing from db...')
    # Delete in batches of 1000; an empty URL list issues no statement.
    for url_batch in chunker(df['url'].to_list(), 1000):
        con.execute(delete_stmt, {'urls': url_batch})
    print('saving data to db...')
    chunks = chunker(df[['url', 'html_text']], 1000)
    for chunk in tqdm(chunks):
        chunk.to_sql('web_scrape', con, if_exists='append', index=False, method='multi')

### Extracting additional info

In [5]:
# Load the listing with its scraped HTML from the compressed CSV.
df = pd.read_csv('../data/anime_list_html.csv.xz')

In [6]:
# Dump one known page to disk as a fixture for developing the parser.
# The encoding is explicit so non-ASCII characters in the HTML are written
# the same way on every platform.
with open('../data/test.html', 'w', encoding='utf-8') as file:
    file.write(df.loc[df['title'] == 'Fullmetal Alchemist: Brotherhood', 'html_text'].iloc[0])

In [7]:
def _text_or_none(node, collapse_newlines=False):
    """Return the text of a parsed node, or None when the node is missing."""
    if node is None:
        return None
    text = node.text
    return text.replace('\n', ' ').strip() if collapse_newlines else text


def parseInfo(html):
    """Extract the descriptive fields of one anime page.

    Parameters
    ----------
    html : str
        HTML of an anime page. Anything that is not a non-blank string
        (None, NaN, '') is treated as a failed scrape.

    Returns
    -------
    tuple
        `(title, num_eps, studio, start_end_years, season_year, rating,
        synopsis, tags, content_warnings, url)`. Every element that cannot
        be found on the page is None; `tags` and `content_warnings` are
        lists of strings when present.
    """
    # Failed scrapes are stored as '' and come back from CSV as NaN; return
    # an all-None row instead of crashing the whole worker pool.
    if not isinstance(html, str) or not html.strip():
        return (None,) * 10

    # Explicit parser, matching the other BeautifulSoup calls in this file,
    # so the result does not depend on which parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    title = _text_or_none(soup.find('h1', {'itemprop':'name'}))

    section = soup.find(attrs={'class': 'pure-g entryBar'})
    if section is not None:
        num_eps = _text_or_none(section.find('span', {'class':'type'}), collapse_newlines=True)
        studio = _text_or_none(section.find('a', {'href': re.compile(r'/anime/studios/.*')}))
        start_end_years = _text_or_none(section.find('span', {'class': 'iconYear'}))
        season_year = _text_or_none(section.find('a', {'href': re.compile(r'/anime/seasons/.*')}))
        rating = _text_or_none(section.find('div', {'class': 'avgRating'}), collapse_newlines=True)
    else:
        num_eps = studio = start_end_years = season_year = rating = None

    tags_section = soup.find('div', {'class':'tags'})
    if tags_section is not None:
        tags = [tag.text.replace('\n', ' ').strip()
                for tag in tags_section.find_all('a', {'href': re.compile(r'/anime/tags/.*')})]
    else:
        tags = None

    cw_section = soup.find('div', {'class':'tags tags--plain'})
    if cw_section is not None:
        content_warnings = [cw.text.replace('\n', ' ').strip() for cw in cw_section.find_all('li')]
    else:
        content_warnings = None

    # The synopsis is taken from the first <p> element of the document.
    synopsis = _text_or_none(soup.find('p'))
    link = soup.find('link', {'href': re.compile(r'https://www.anime-planet.com/anime/')})
    url = link['href'] if link is not None else None

    return (title, num_eps, studio, start_end_years, season_year, rating, synopsis, tags, content_warnings, url)

In [8]:
# Parse every page across 14 worker processes; Pool.map returns the results
# as a list in input order.
with Pool(14) as pool:
    list_of_tups = pool.map(parseInfo, df['html_text'])

In [9]:
# Column names, in the same order as the tuple returned by parseInfo.
anime_columns = [
    'title', 'num_eps', 'studio', 'start_end_years', 'season_year',
    'rating', 'synopsis', 'tags', 'content_warnings', 'url',
]
anime = pd.DataFrame(list_of_tups, columns=anime_columns)

### Cleaning Anime Data

#### `num_eps`

In [10]:
# Inspect the raw `num_eps` strings before parsing them.
anime['num_eps']

0                TV (12 eps x 5 min)
1               Movie (1 ep x 5 min)
2              TV (104 eps x 23 min)
3                                Web
4              Movie (1 ep x 80 min)
                    ...             
17122                   Movie (1 ep)
17123     DVD Special (1 ep x 6 min)
17124    DVD Special (1 ep x 24 min)
17125                    TV (12 eps)
17126           Web (20 eps x 2 min)
Name: num_eps, Length: 17127, dtype: object

In [11]:
# Split strings such as "TV (12 eps x 5 min)" into the named groups
# type / num_eps / ongoing / duration; the parenthesised part is optional.
pattern = r"""(?P<type>(?:TV\sSpecial|TV|Movie|OVA|Music\sVideo|Other|DVD\sSpecial|Web)+)(?:\s+\((?P<num_eps>\d+)(?P<ongoing>\+)?\seps?(?:\sx\s(?P<duration>\d+)\smin)?\))?"""
tmp = (
    anime['num_eps']
    .str.extract(pattern)
    # `ongoing` becomes True when a "+" followed the episode count.
    .assign(ongoing=lambda parsed: parsed['ongoing'].notnull())
)

In [12]:
# Overwrite `num_eps` with the parsed episode count and add the `type`,
# `ongoing` and `duration` columns from `tmp`.
# NOTE(review): after this cell `num_eps` no longer holds the raw text, so
# the extraction only works on a freshly built `anime` — run cells in order.
anime[['type', 'num_eps', 'ongoing', 'duration']] = tmp

In [13]:
# Display the frame after splitting `num_eps`.
anime

Unnamed: 0,title,num_eps,studio,start_end_years,season_year,rating,synopsis,tags,content_warnings,url,type,ongoing,duration
0,Gag Manga Biyori 2,12,Artland,2006,Summer 2006,3.583 out of 5 from 233 votes,"The lupine detective Usami-chan is back, and r...","[Comedy, Shounen, Crude, Episodic, Gag, Short ...",,https://www.anime-planet.com/anime/gag-manga-b...,TV,False,5
1,Fu Yu Nu,1,,2016,,7 needed to calculate an average,No synopsis yet - check back soon!,"[Chinese Animation, Shorts]",,https://www.anime-planet.com/anime/fu-yu-nu,Movie,False,5
2,Kijeu CSI: Gwahaksusadae,104,,2012 - 2014,,7 needed to calculate an average,No synopsis yet - check back soon!,"[Adventure, Mystery, Korean Animation]",,https://www.anime-planet.com/anime/kijeu-csi-g...,TV,False,23
3,Zuoshou Shanglan,,,TBA,,10 needed to calculate an average,No synopsis yet - check back soon!,"[Sports, Basketball, Chinese Animation]",,https://www.anime-planet.com/anime/zuoshou-sha...,Web,False,
4,Jeonsa Ryan,1,,1997,,10 needed to calculate an average,No synopsis yet - check back soon!,"[Adventure, Fantasy, Family Friendly, Korean A...",,https://www.anime-planet.com/anime/jeonsa-ryan,Movie,False,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,BanG Dream! Movie: Episode of Roselia - Part I...,1,SANZIGEN,2021,,3.528 out of 5 from 31 votes,No synopsis yet - check back soon!,"[Idols, Music]",,https://www.anime-planet.com/anime/bang-dream-...,Movie,False,
17123,Yu Yu Hakusho Picture Drama,1,Pierrot,2009,,3.528 out of 5 from 811 votes,No synopsis yet - check back soon!,"[Comedy, Shounen, Picture Drama, Based on a Ma...",,https://www.anime-planet.com/anime/yu-yu-hakus...,DVD Special,False,6
17124,Kannagi: If You Are a Shrine Maiden,1,A-1 Pictures,2009,,"3.528 out of 5 from 2,082 votes",One day Shino and Takako find a money clip in ...,"[Comedy, Fantasy, Shoujo, Japanese Mythology, ...",,https://www.anime-planet.com/anime/kannagi-if-...,DVD Special,False,24
17125,KADO: The Right Answer,12,Toei Animation,2017,Spring 2017,"3.527 out of 5 from 3,203 votes",Koujiro Shindo is a highly-skilled negotiator ...,"[Drama, Sci Fi, Political, CG Animation, Origi...",,https://www.anime-planet.com/anime/kado-the-ri...,TV,False,


#### `start_end_years`

In [17]:
# Remove leading and trailing whitespace from the year text before parsing it.
anime['start_end_years'] = anime['start_end_years'].str.strip()

In [64]:
# "2012 - 2014" -> start_year / end_year. A lone value fills only
# start_year; "TBA" is accepted as a start and "?" as an end.
year_pattern = r'(?P<start_year>(?:(?:\d+)|TBA))(?:\s-\s(?P<end_year>(?:(?:\d+)|\?)))?'
anime[['start_year', 'end_year']] = anime['start_end_years'].str.extract(year_pattern)

In [69]:
# An end year of exactly '?' is normalised to 'TBA'.
anime['end_year'] = anime['end_year'].replace({'?': 'TBA'})

In [72]:
# Rows without an end year reuse their start year.
anime['end_year'] = anime['end_year'].fillna(anime['start_year'])

In [73]:
# Sanity check: count the missing values left in the two year columns.
anime[['start_year', 'end_year']].isnull().sum()

start_year    0
end_year      0
dtype: int64

In [74]:
# Remove the combined column now that it has been split into
# `start_year` and `end_year`.
del anime['start_end_years']

#### `season_year`

In [75]:
# Keep only the first alphabetic word of `season_year` (e.g. "Summer" from
# "Summer 2006") as `season`, then drop the source column.
season_names = anime['season_year'].str.extract(r'(?P<season>[A-Za-z]+)', expand=False)
anime = anime.assign(season=season_names).drop(columns=['season_year'])

In [76]:
# Display the frame after the year and season clean-up.
anime

Unnamed: 0,title,num_eps,studio,rating,synopsis,tags,content_warnings,url,type,ongoing,duration,start_year,end_year,season
0,Gag Manga Biyori 2,12,Artland,3.583 out of 5 from 233 votes,"The lupine detective Usami-chan is back, and r...","[Comedy, Shounen, Crude, Episodic, Gag, Short ...",,https://www.anime-planet.com/anime/gag-manga-b...,TV,False,5,2006,2006,Summer
1,Fu Yu Nu,1,,7 needed to calculate an average,No synopsis yet - check back soon!,"[Chinese Animation, Shorts]",,https://www.anime-planet.com/anime/fu-yu-nu,Movie,False,5,2016,2016,
2,Kijeu CSI: Gwahaksusadae,104,,7 needed to calculate an average,No synopsis yet - check back soon!,"[Adventure, Mystery, Korean Animation]",,https://www.anime-planet.com/anime/kijeu-csi-g...,TV,False,23,2012,2014,
3,Zuoshou Shanglan,,,10 needed to calculate an average,No synopsis yet - check back soon!,"[Sports, Basketball, Chinese Animation]",,https://www.anime-planet.com/anime/zuoshou-sha...,Web,False,,TBA,TBA,
4,Jeonsa Ryan,1,,10 needed to calculate an average,No synopsis yet - check back soon!,"[Adventure, Fantasy, Family Friendly, Korean A...",,https://www.anime-planet.com/anime/jeonsa-ryan,Movie,False,80,1997,1997,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,BanG Dream! Movie: Episode of Roselia - Part I...,1,SANZIGEN,3.528 out of 5 from 31 votes,No synopsis yet - check back soon!,"[Idols, Music]",,https://www.anime-planet.com/anime/bang-dream-...,Movie,False,,2021,2021,
17123,Yu Yu Hakusho Picture Drama,1,Pierrot,3.528 out of 5 from 811 votes,No synopsis yet - check back soon!,"[Comedy, Shounen, Picture Drama, Based on a Ma...",,https://www.anime-planet.com/anime/yu-yu-hakus...,DVD Special,False,6,2009,2009,
17124,Kannagi: If You Are a Shrine Maiden,1,A-1 Pictures,"3.528 out of 5 from 2,082 votes",One day Shino and Takako find a money clip in ...,"[Comedy, Fantasy, Shoujo, Japanese Mythology, ...",,https://www.anime-planet.com/anime/kannagi-if-...,DVD Special,False,24,2009,2009,
17125,KADO: The Right Answer,12,Toei Animation,"3.527 out of 5 from 3,203 votes",Koujiro Shindo is a highly-skilled negotiator ...,"[Drama, Sci Fi, Political, CG Animation, Origi...",,https://www.anime-planet.com/anime/kado-the-ri...,TV,False,,2017,2017,Spring


#### `rating`

In [77]:
# Ratings reading "<n> needed to calculate an average" carry no score:
# blank them out and turn the resulting empty strings into NaN.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
anime['rating'] = (
    anime['rating']
    .str.replace('.* needed to calculate an average', '', regex=True)
    .replace('', np.nan)
)

In [78]:
# Inspect the rating strings that remain after blanking the unrated entries.
anime['rating']

0          3.583 out of 5 from 233 votes
1                                    NaN
2                                    NaN
3                                    NaN
4                                    NaN
                      ...               
17122       3.528 out of 5 from 31 votes
17123      3.528 out of 5 from 811 votes
17124    3.528 out of 5 from 2,082 votes
17125    3.527 out of 5 from 3,203 votes
17126       3.527 out of 5 from 14 votes
Name: rating, Length: 17127, dtype: object

In [79]:
# "3.528 out of 5 from 2,082 votes" -> rating "3.528", num_votes "2,082".
rating_pattern = r'(?P<rating>\d\.?\d*) out of 5 from (?P<num_votes>[\d,]+) votes'
anime[['rating', 'num_votes']] = anime['rating'].str.extract(rating_pattern)
# Drop the thousands separators from the vote counts.
anime['num_votes'] = anime['num_votes'].str.replace(',', '', regex=False)

In [81]:
# Display the frame after splitting the rating into score and vote count.
anime

Unnamed: 0,title,num_eps,studio,rating,synopsis,tags,content_warnings,url,type,ongoing,duration,start_year,end_year,season,num_votes
0,Gag Manga Biyori 2,12,Artland,3.583,"The lupine detective Usami-chan is back, and r...","[Comedy, Shounen, Crude, Episodic, Gag, Short ...",,https://www.anime-planet.com/anime/gag-manga-b...,TV,False,5,2006,2006,Summer,233
1,Fu Yu Nu,1,,,No synopsis yet - check back soon!,"[Chinese Animation, Shorts]",,https://www.anime-planet.com/anime/fu-yu-nu,Movie,False,5,2016,2016,,
2,Kijeu CSI: Gwahaksusadae,104,,,No synopsis yet - check back soon!,"[Adventure, Mystery, Korean Animation]",,https://www.anime-planet.com/anime/kijeu-csi-g...,TV,False,23,2012,2014,,
3,Zuoshou Shanglan,,,,No synopsis yet - check back soon!,"[Sports, Basketball, Chinese Animation]",,https://www.anime-planet.com/anime/zuoshou-sha...,Web,False,,TBA,TBA,,
4,Jeonsa Ryan,1,,,No synopsis yet - check back soon!,"[Adventure, Fantasy, Family Friendly, Korean A...",,https://www.anime-planet.com/anime/jeonsa-ryan,Movie,False,80,1997,1997,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,BanG Dream! Movie: Episode of Roselia - Part I...,1,SANZIGEN,3.528,No synopsis yet - check back soon!,"[Idols, Music]",,https://www.anime-planet.com/anime/bang-dream-...,Movie,False,,2021,2021,,31
17123,Yu Yu Hakusho Picture Drama,1,Pierrot,3.528,No synopsis yet - check back soon!,"[Comedy, Shounen, Picture Drama, Based on a Ma...",,https://www.anime-planet.com/anime/yu-yu-hakus...,DVD Special,False,6,2009,2009,,811
17124,Kannagi: If You Are a Shrine Maiden,1,A-1 Pictures,3.528,One day Shino and Takako find a money clip in ...,"[Comedy, Fantasy, Shoujo, Japanese Mythology, ...",,https://www.anime-planet.com/anime/kannagi-if-...,DVD Special,False,24,2009,2009,,2082
17125,KADO: The Right Answer,12,Toei Animation,3.527,Koujiro Shindo is a highly-skilled negotiator ...,"[Drama, Sci Fi, Political, CG Animation, Origi...",,https://www.anime-planet.com/anime/kado-the-ri...,TV,False,,2017,2017,Spring,3203


In [82]:
# Largest parsed duration; cast to float first because `duration` still
# holds the strings captured by the extraction regex.
anime['duration'].astype('float').max()

235.0

#### Change datatypes

In [83]:
# Target dtype per column; the key order also defines the final column order.
dtypes_dict = dict(
    title='string',
    type='category',
    num_eps='float32',
    ongoing='bool',
    duration='float32',
    studio='category',
    start_year='category',
    end_year='category',
    season='category',
    rating='float32',
    num_votes='float32',
    synopsis='string',
    tags='object',
    content_warnings='object',
    url='string',
)
anime = anime.astype(dtypes_dict)[list(dtypes_dict)]

In [84]:
# Display the frame after the dtype conversion and column reordering.
anime

Unnamed: 0,title,type,num_eps,ongoing,duration,studio,start_year,end_year,season,rating,num_votes,synopsis,tags,content_warnings,url
0,Gag Manga Biyori 2,TV,12.0,False,5.0,Artland,2006,2006,Summer,3.583,233.0,"The lupine detective Usami-chan is back, and r...","[Comedy, Shounen, Crude, Episodic, Gag, Short ...",,https://www.anime-planet.com/anime/gag-manga-b...
1,Fu Yu Nu,Movie,1.0,False,5.0,,2016,2016,,,,No synopsis yet - check back soon!,"[Chinese Animation, Shorts]",,https://www.anime-planet.com/anime/fu-yu-nu
2,Kijeu CSI: Gwahaksusadae,TV,104.0,False,23.0,,2012,2014,,,,No synopsis yet - check back soon!,"[Adventure, Mystery, Korean Animation]",,https://www.anime-planet.com/anime/kijeu-csi-g...
3,Zuoshou Shanglan,Web,,False,,,TBA,TBA,,,,No synopsis yet - check back soon!,"[Sports, Basketball, Chinese Animation]",,https://www.anime-planet.com/anime/zuoshou-sha...
4,Jeonsa Ryan,Movie,1.0,False,80.0,,1997,1997,,,,No synopsis yet - check back soon!,"[Adventure, Fantasy, Family Friendly, Korean A...",,https://www.anime-planet.com/anime/jeonsa-ryan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17122,BanG Dream! Movie: Episode of Roselia - Part I...,Movie,1.0,False,,SANZIGEN,2021,2021,,3.528,31.0,No synopsis yet - check back soon!,"[Idols, Music]",,https://www.anime-planet.com/anime/bang-dream-...
17123,Yu Yu Hakusho Picture Drama,DVD Special,1.0,False,6.0,Pierrot,2009,2009,,3.528,811.0,No synopsis yet - check back soon!,"[Comedy, Shounen, Picture Drama, Based on a Ma...",,https://www.anime-planet.com/anime/yu-yu-hakus...
17124,Kannagi: If You Are a Shrine Maiden,DVD Special,1.0,False,24.0,A-1 Pictures,2009,2009,,3.528,2082.0,One day Shino and Takako find a money clip in ...,"[Comedy, Fantasy, Shoujo, Japanese Mythology, ...",,https://www.anime-planet.com/anime/kannagi-if-...
17125,KADO: The Right Answer,TV,12.0,False,,Toei Animation,2017,2017,Spring,3.527,3203.0,Koujiro Shindo is a highly-skilled negotiator ...,"[Drama, Sci Fi, Political, CG Animation, Origi...",,https://www.anime-planet.com/anime/kado-the-ri...


In [85]:
# Persist the cleaned data as a compressed CSV, a compressed pickle and a
# database table.
anime.to_csv('../data/anime.csv.xz', index=False)
anime.to_pickle('../data/anime.pkl.xz')
# if_exists='replace' drops and recreates the `anime` table.
# chunksize batches the multi-row INSERTs to 1000 rows each, the same batch
# size this notebook uses for `web_scrape`, instead of one statement that
# holds every row.
anime.to_sql('anime', db, if_exists='replace', index=False, method='multi', chunksize=1000)