In [1]:
import pickle
import numpy as np
import pandas as pd
import json
import sqlalchemy as sql
from sqlalchemy import create_engine
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
from io import StringIO 
import time
import re
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool
import random
from urllib.parse import quote

In [2]:
# Read the database login from the credentials file that lives outside the notebook.
with open('../tools/credentials.json') as file:
    credentials = json.load(file)

# Both values sit under the "dblogin" entry of the loaded JSON.
username, password = (credentials["dblogin"][field] for field in ("username", "password"))

In [3]:
# Percent-encode the credentials before embedding them in the connection URL:
# a raw '@', ':' or '/' in either value would otherwise be read as a URL delimiter.
db_string = (
    f"postgresql://{quote(str(username), safe='')}:{quote(str(password), safe='')}"
    "@192.168.0.3:5432/animeplanet"
)
db = create_engine(db_string)

In [4]:
def chunker(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    ``seq`` must support ``len()`` and slicing. The final slice is shorter when
    ``len(seq)`` is not a multiple of ``size``. Returns a generator of slices.
    """
    offsets = range(0, len(seq), size)
    return (seq[start:start + size] for start in offsets)

### Get Anime List

In [None]:
# Progress marker for the first stage: building the list of anime entries.
print('scraping anime list...')

In [None]:
# Listing pages are addressed by appending a page number to this prefix.
base_url = 'https://www.anime-planet.com/anime/top-anime?page='

# Fetch page 1 via the special-requests service at 192.168.0.3 (the target URL is
# passed as a query parameter) and pick out its <ul class="nav"> element.
url = f'{base_url}1'
resp = requests.get(f'http://192.168.0.3:5000/special-requests?url={quote(url)}')
soup = BeautifulSoup(resp.text, 'html.parser')
ul = soup.find('ul', class_='nav')

In [None]:
# Collect the numeric labels of the links inside the navigation list; the largest
# one is taken as the number of listing pages.
page_nums = []
for tag in ul.find_all('a'):
    try:
        page_nums.append(int(tag.text))
    except ValueError:
        # Link text that is not an integer is skipped. Only ValueError is caught
        # so that interrupts and unrelated errors still surface.
        continue

if not page_nums:
    raise ValueError('no numeric page links found in the navigation list')

num_pages = max(page_nums)

urls = [f'{base_url}{i}' for i in range(1, num_pages+1)]

In [None]:
def scrapeTable(url, max_attempts=10):
    """Fetch one listing page and return its table as a DataFrame.

    The page is requested through the special-requests service at 192.168.0.3.
    An empty response body is treated as a failed fetch and the request is
    repeated, up to ``max_attempts`` times.

    Parameters
    ----------
    url : str
        URL of the listing page to scrape.
    max_attempts : int, default 10
        Maximum number of requests to make before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``type``, ``year`` (read from the page's table) and
        ``url`` (string dtype), built from the ``<a>`` tags inside the table.

    Raises
    ------
    RuntimeError
        If every attempt returned an empty body.
    """
    request_url = f'http://192.168.0.3:5000/special-requests?url={quote(url)}'
    # Bounded loop instead of unbounded recursion: a persistently empty response
    # now fails with a clear message rather than a RecursionError.
    for _ in range(max_attempts):
        resp = requests.get(request_url)
        if resp.text != '':
            break
    else:
        raise RuntimeError(f'empty response for {url} after {max_attempts} attempts')

    soup = BeautifulSoup(resp.text, 'html.parser')
    table = soup.find('table')
    # .copy() so the column assignment below writes to an independent frame,
    # not to a slice of the frame returned by read_html.
    chunk = pd.read_html(StringIO(str(table)), index_col='Rank')[0][['Title', 'Type', 'Year']].copy()
    # A conditional expression only builds the absolute URL when href exists.
    # (np.where evaluates both branches first, so a link without href raised
    # TypeError on 'str + None' before the fallback could apply.)
    chunk['url'] = [
        'https://www.anime-planet.com' + tag.get('href') if tag.has_attr('href') else 'no link'
        for tag in table.find_all('a')
    ]
    chunk.columns = [col.lower() for col in chunk.columns]
    chunk['url'] = chunk['url'].astype('string')
    return chunk

In [None]:
chunksize = 10

url_chunks = chunker(urls, chunksize)
# Ceiling division gives the exact chunk count, so the progress bar total is
# also right when len(urls) is a multiple of chunksize.
n_chunks = (len(urls) + chunksize - 1) // chunksize

# Gather the per-page frames and concatenate once at the end instead of
# re-copying a growing frame on every iteration.
scraped_tables = []
for url_chunk in tqdm(url_chunks, total=n_chunks):
    # One worker per URL, so each chunk is fetched concurrently.
    with ThreadPoolExecutor(max_workers=chunksize) as executor:
        scraped_tables.extend(executor.map(scrapeTable, url_chunk))

    # Random pause between chunks, clamped to the range 1-5 seconds.
    time.sleep(max(min(np.random.poisson(2), 5), 1))

# pd.concat rejects an empty list, so fall back to an empty frame when there were no URLs.
df = pd.concat(scraped_tables, ignore_index=True) if scraped_tables else pd.DataFrame()

In [None]:
# Keep one row per URL (the first occurrence) and renumber the index.
df = df.drop_duplicates(subset='url', ignore_index=True)

In [None]:
print('saving data to file...')
df.to_csv('../data/anime_list.csv.xz', index=False)

# Engine.begin() opens a transaction that commits when the block exits cleanly
# and rolls back on error. The DELETE and the INSERT therefore succeed or fail
# together, and the table is not left emptied if the insert raises.
with db.begin() as con:
    print('removing from db...')
    query = "DELETE FROM anime;"
    con.execute(sql.text(query))

    print('saving data to db...')
    df.to_sql('anime', con, if_exists='append', index=False, method='multi')

### Scrape Anime Pages

In [None]:
# Progress marker for the second stage: downloading each anime's own page.
print('scraping anime pages...')

In [None]:
# Load the 'anime' table (the table the list-scraping section writes to) into a frame.
df = pd.read_sql('anime', db)

In [None]:
def getPage(url, attempt=1):
    """Download ``url`` through the special-requests service, retrying empty bodies.

    ``attempt`` is the number of the current try. When it reaches 4 no request
    is made and ``(url, '')`` is returned, so a call starting at the default
    makes at most three requests.

    Returns a ``(url, html)`` tuple; ``html`` is ``''`` when every try came back empty.
    """
    while attempt != 4:
        resp = requests.get(f'http://192.168.0.3:5000/special-requests?url={quote(url)}')
        if resp.text != '':
            return (url, resp.text)
        attempt += 1
    return (url, '')

In [None]:
chunksize = 10

url_list = df['url'].to_list()
url_chunks = chunker(url_list, chunksize)
# Ceiling division gives the exact chunk count, so the progress bar total is
# also right when len(url_list) is a multiple of chunksize.
n_chunks = (len(url_list) + chunksize - 1) // chunksize

url_html_dict = {}
for url_chunk in tqdm(url_chunks, total=n_chunks):
    with ThreadPoolExecutor(max_workers=chunksize) as executor:
        # getPage yields (url, html) pairs, which dict.update consumes directly.
        url_html_dict.update(executor.map(getPage, url_chunk))

    # Random pause between chunks, clamped to the range 4-30 seconds.
    time.sleep(max(min(np.random.poisson(10), 30), 4))

In [None]:
# Attach each page's HTML by URL lookup; getPage stores '' for a URL whose tries all came back empty.
df['html_text'] = df['url'].map(url_html_dict)

In [None]:
print('saving data to file...')
df.to_csv('../data/anime_list_html.csv.xz', index=False)

# Pass the URL list as an expanding bound parameter rather than pasting the
# Python repr of the list into the SQL text: the driver then handles quoting,
# so URLs containing quote characters cannot break or alter the statement.
urls_to_replace = df['url'].to_list()
delete_stmt = sql.text(
    "DELETE FROM web_scrape WHERE url IN :urls"
).bindparams(sql.bindparam('urls', expanding=True))

# Engine.begin() commits on clean exit and rolls back on error, so the old rows
# are only removed if all the replacement rows are written.
with db.begin() as con:
    print('removing from db...')
    if urls_to_replace:
        con.execute(delete_stmt, {'urls': urls_to_replace})
    print('saving data to db...')
    # Insert in slices of 1000 rows to bound the size of each multi-row INSERT.
    chunks = chunker(df[['url', 'html_text']], 1000)
    for chunk in tqdm(chunks):
        chunk.to_sql('web_scrape', con, if_exists='append', index=False, method='multi')

### Extracting additional info

In [5]:
# Reload the scraped pages from the CSV file that the page-scraping section writes.
df = pd.read_csv('../data/anime_list_html.csv.xz')

In [6]:
# Display the loaded frame (pandas abbreviates long frames and long cell values).
df

Unnamed: 0,title,type,year,url,html_text
0,Gag Manga Biyori 2,TV,2006.0,https://www.anime-planet.com/anime/gag-manga-b...,"\n<!doctype html>\n<html xml:lang=""en"" lang=""e..."
1,Fu Yu Nu,Movie,2016.0,https://www.anime-planet.com/anime/fu-yu-nu,"\n<!doctype html>\n<html xml:lang=""en"" lang=""e..."
2,Kijeu CSI: Gwahaksusadae,TV,2012.0,https://www.anime-planet.com/anime/kijeu-csi-g...,"\n<!doctype html>\n<html xml:lang=""en"" lang=""e..."
3,Zuoshou Shanglan,Web,,https://www.anime-planet.com/anime/zuoshou-sha...,"\n<!doctype html>\n<html xml:lang=""en"" lang=""e..."
4,Jeonsa Ryan,Movie,1997.0,https://www.anime-planet.com/anime/jeonsa-ryan,"\n<!doctype html>\n<html xml:lang=""en"" lang=""e..."
...,...,...,...,...,...
17122,BanG Dream! Movie: Episode of Roselia - Part I...,Movie,2021.0,https://www.anime-planet.com/anime/bang-dream-...,"\n<!doctype html>\n<html xml:lang=""en"" lang=""e..."
17123,Yu Yu Hakusho Picture Drama,DVD Special,2009.0,https://www.anime-planet.com/anime/yu-yu-hakus...,"\n<!doctype html>\n<html xml:lang=""en"" lang=""e..."
17124,Kannagi: If You Are a Shrine Maiden,DVD Special,2009.0,https://www.anime-planet.com/anime/kannagi-if-...,"\n<!doctype html>\n<html xml:lang=""en"" lang=""e..."
17125,KADO: The Right Answer,TV,2017.0,https://www.anime-planet.com/anime/kado-the-ri...,"\n<!doctype html>\n<html xml:lang=""en"" lang=""e..."


In [14]:
def email(string):
    """Decode a hex-encoded, XOR-obfuscated string.

    The first two hex digits of ``string`` are the key; each following pair of
    hex digits is XOR-ed with that key and converted to a character.
    Returns the decoded text.
    """
    key = int(string[:2], 16)
    decoded_chars = []
    for offset in range(2, len(string), 2):
        code = int(string[offset:offset + 2], 16) ^ key
        decoded_chars.append(chr(code))
    return ''.join(decoded_chars)

In [38]:
def _text_or_none(node, clean=False):
    """Return ``node.text``, or None when ``node`` is None.

    With ``clean=True`` newlines are replaced by spaces and the result is stripped.
    """
    if node is None:
        return None
    text = node.text
    return text.replace('\n', ' ').strip() if clean else text


def parseInfo(html):
    """Extract the summary fields from the HTML of one anime page.

    Parameters
    ----------
    html : str
        Full HTML text of the page.

    Returns
    -------
    tuple
        ``(title, num_eps, studio, start_end_years, season_year, rating,
        synopsis, tags, content_warnings, url)``. ``num_eps``, ``studio``,
        ``start_end_years``, ``season_year``, ``rating``, ``tags`` and
        ``content_warnings`` are None when their element is absent; ``tags``
        and ``content_warnings`` are lists of strings otherwise.

    Raises
    ------
    AttributeError
        If the title heading, the entry bar section, or the first ``<p>`` is missing.
    TypeError
        If the canonical ``<link>`` element is missing.
    """
    # Name the parser explicitly, as the scraping cells do. Without it
    # BeautifulSoup picks whichever parser is installed, so results can differ
    # between machines and every call emits a warning.
    soup = BeautifulSoup(html, 'html.parser')

    title = soup.find('h1', {'itemprop':'name'}).text
    if '[email\xa0protected]' in title:
        # The placeholder stands in for text that was obfuscated as an e-mail
        # address; the encoded value is carried in the link's data-cfemail attribute.
        protected_link = soup.find('a', attrs={'href': '/cdn-cgi/l/email-protection'})
        title = title.replace('[email\xa0protected]', email(protected_link['data-cfemail']))

    section = soup.find(attrs={'class': 'pure-g entryBar'})
    num_eps = _text_or_none(section.find('span', {'class':'type'}), clean=True)
    studio = _text_or_none(section.find('a', {'href': re.compile(r'/anime/studios/.*')}))
    start_end_years = _text_or_none(section.find('span', {'class': 'iconYear'}))
    season_year = _text_or_none(section.find('a', {'href': re.compile(r'/anime/seasons/.*')}))
    # Guarded like the other entry-bar fields, so a page without a rating block
    # yields None instead of aborting the whole parallel map.
    rating = _text_or_none(section.find('div', {'class': 'avgRating'}), clean=True)

    tags = None
    tags_section = soup.find('div', {'class':'tags'})
    if tags_section is not None:
        tag_links = tags_section.find_all('a', {'href': re.compile(r'/anime/tags/.*')})
        tags = [_text_or_none(link, clean=True) for link in tag_links]

    content_warnings = None
    cw_section = soup.find('div', {'class':'tags tags--plain'})
    if cw_section is not None:
        content_warnings = [
            item.text.replace('\n', ' ').replace(',', '').strip()
            for item in cw_section.find_all('li')
        ]

    # The synopsis is taken from the first <p> element in the document.
    synopsis = soup.find('p').text
    url = soup.find('link', {'href': re.compile(r'https://www.anime-planet.com/anime/')})['href']

    return (title, num_eps, studio, start_end_years, season_year, rating, synopsis, tags, content_warnings, url)

In [39]:
# Parse every stored page with a pool of 14 worker processes; Pool.map returns
# the results as a list in input order.
with Pool(processes=14) as pool:
    list_of_tups = pool.map(parseInfo, df['html_text'])

In [40]:
# Build the result frame; the column order mirrors the tuple returned by parseInfo.
anime = pd.DataFrame(
    list_of_tups,
    columns=[
        'title',
        'num_eps',
        'studio',
        'start_end_years',
        'season_year',
        'rating',
        'synopsis',
        'tags',
        'content_warnings',
        'url',
    ],
)

In [41]:
# Persist the parsed table twice, as CSV and as pickle; pandas infers xz
# compression from the .xz suffix of each path.
anime.to_csv('../data/anime_raw.csv.xz', index=False)
anime.to_pickle('../data/anime_raw.pkl.xz')