In [1]:
import pickle
import numpy as np
import pandas as pd
import json
import sqlalchemy as sql
from sqlalchemy import create_engine
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
from io import StringIO 
import time
from concurrent.futures import ThreadPoolExecutor
import random
from urllib.parse import quote

In [2]:
# Load the shared credentials file and pick the database login out of its
# "dblogin" entry, which must provide "username" and "password".
with open('../tools/credentials.json') as file:
    credentials = json.load(file)

username, password = (
    credentials["dblogin"][field] for field in ("username", "password")
)

In [3]:
# Percent-encode the login parts before embedding them in the database URL:
# characters such as "@", ":" or "/" in a password would otherwise be read as
# URL delimiters and corrupt the connection string. safe='' makes quote()
# escape "/" as well; str() keeps non-string JSON values (e.g. a numeric
# password) working as they did with plain f-string interpolation.
db_string = (
    f"postgresql://{quote(str(username), safe='')}:{quote(str(password), safe='')}"
    "@localhost:5432/animeplanet"
)
db = create_engine(db_string)

In [4]:
def chunker(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    ``seq`` must support ``len()`` and slice indexing. The final slice is
    shorter when ``len(seq)`` is not a multiple of ``size``; an empty ``seq``
    yields nothing.
    """
    offsets = range(0, len(seq), size)
    return (seq[start:start + size] for start in offsets)

### Get Anime List

In [None]:
# Progress message marking the start of the list-scraping stage.
print('scraping anime list...')

In [None]:
# Prefix of the paginated "top anime" listing; a page number is appended to it.
base_url = 'https://www.anime-planet.com/anime/top-anime?page='

# Request page 1 through the service at 192.168.0.3:5000 (the target URL is
# passed percent-encoded in its `url` query parameter), parse the returned
# HTML and keep the first <ul class="nav"> element.
url = base_url + str(1)
resp = requests.get('http://192.168.0.3:5000/special-requests?url=' + quote(url))
soup = BeautifulSoup(resp.text, 'html.parser')
ul = soup.find('ul', class_='nav')

In [None]:
# Collect every link label inside `ul` that parses as an integer; the largest
# one is taken as the number of listing pages.
if ul is None:
    # soup.find() returns None when the element is absent; fail with a clear
    # message instead of an AttributeError on the next line.
    raise RuntimeError('<ul class="nav"> not found on the first listing page')

page_nums = []
for tag in ul.find_all('a'):
    try:
        page_nums.append(int(tag.text))
    except ValueError:
        # Label is not a number -- skip it. Only ValueError is caught so that
        # unrelated failures (and Ctrl-C) are not silently swallowed.
        continue

if not page_nums:
    raise RuntimeError('no numeric page links found in <ul class="nav">')

num_pages = max(page_nums)

In [None]:
def scrapeTable(url, max_attempts=10):
    """Scrape one page of the ranked listing into a DataFrame.

    The page is requested through the service at 192.168.0.3:5000, with the
    target URL passed percent-encoded in its ``url`` query parameter. An empty
    response body, a response without a ``<table>`` element, or a
    ``requests.RequestException`` counts as a failed attempt and is retried.

    Parameters
    ----------
    url : str
        Listing page to scrape.
    max_attempts : int, default 10
        Upper bound on fetch attempts. The budget is generous because giving
        up raises, which aborts the whole scrape.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Rank' with columns 'Title', 'Type', 'Year' and 'Url'.
        'Url' holds the absolute link for each ``<a>`` in the table, or
        'no link' for anchors that have no ``href``.

    Raises
    ------
    RuntimeError
        If no usable table was obtained within ``max_attempts`` attempts.
    ValueError
        Propagated from pandas if the table cannot be parsed or the number of
        ``<a>`` tags does not match the number of rows.
    """
    proxied_url = f'http://192.168.0.3:5000/special-requests?url={quote(url)}'
    last_error = None

    # Bounded loop instead of unbounded self-recursion, so a page that keeps
    # coming back empty cannot end in a RecursionError.
    for _ in range(max_attempts):
        try:
            html = requests.get(proxied_url).text
        except requests.RequestException as exc:
            last_error = exc
            continue
        if html == '':
            continue

        table = BeautifulSoup(html, 'html.parser').find('table')
        if table is None:
            # Non-empty page without a table: treat like an empty response.
            continue

        chunk = pd.read_html(StringIO(str(table)), index_col='Rank')[0][['Title', 'Type', 'Year']].copy()
        # A conditional expression only evaluates the branch that applies, so
        # anchors without an href really do fall back to 'no link', and the
        # column holds plain strings.
        chunk['Url'] = [
            'https://www.anime-planet.com' + tag['href'] if tag.has_attr('href') else 'no link'
            for tag in table.find_all('a')
        ]
        return chunk

    raise RuntimeError(
        f'no ranking table retrieved from {url} after {max_attempts} attempts'
    ) from last_error

In [None]:
# One listing URL per page number, 1 through num_pages inclusive.
urls = [base_url + str(page) for page in range(1, num_pages + 1)]

# Scrape the pages on a pool of 10 worker threads; executor.map returns the
# results in the same order as `urls`.
with ThreadPoolExecutor(max_workers=10) as executor:
    df_list = [*executor.map(scrapeTable, urls)]

In [None]:
# Stack the per-page frames returned by scrapeTable into a single frame.
df = pd.concat(df_list)

In [None]:
# Lower-case every column label, then store the link column with pandas'
# 'string' extension dtype.
df.columns = list(map(str.lower, df.columns))
df['url'] = df['url'].astype('string')

In [None]:
# Keep the first row for each url, drop later repeats, and renumber the index.
df = df[~df.duplicated(subset=['url'], keep='first')].reset_index(drop=True)

In [None]:
print('saving data to file...')
df.to_csv('../data/anime_list.csv', index=False)

# Engine.begin() runs the block in a single transaction that is committed on
# success and rolled back on error. With a plain db.connect() the DELETE and
# the inserts are never committed under SQLAlchemy 2.x, and a failure part-way
# through the inserts could leave the table emptied.
with db.begin() as con:
    print('removing from db...')
    query = "DELETE FROM anime;"
    con.execute(sql.text(query))

    print('saving data to db...')
    # Insert in 1000-row batches.
    chunks = chunker(df, 1000)
    for chunk in tqdm(chunks):
        chunk.to_sql('anime', con, if_exists='append', index=False, method='multi')

### Scrape Anime Pages

In [5]:
# Progress message marking the start of the per-title page scraping stage.
print('scraping anime pages...')

scraping anime pages...


In [6]:
# Load the whole `anime` table from the database into a DataFrame.
with db.connect() as con:
    df = pd.read_sql(sql='anime', con=con)

In [7]:
def getPage(url, attempt=1, max_attempts=3):
    """Fetch the HTML of ``url`` through the request service, best effort.

    The page is requested from the service at 192.168.0.3:5000 with the target
    URL passed percent-encoded in its ``url`` query parameter. An empty body or
    a ``requests.RequestException`` counts as a failed attempt, so one bad
    request cannot abort the surrounding thread-pool run.

    Parameters
    ----------
    url : str
        Page to fetch.
    attempt : int, default 1
        Number of the first attempt; attempts are counted up from here.
    max_attempts : int, default 3
        Last attempt number that is still tried.

    Returns
    -------
    tuple of (str, str)
        ``(url, html)`` on success, or ``(url, '')`` once the attempts are
        used up.
    """
    proxied_url = f'http://192.168.0.3:5000/special-requests?url={quote(url)}'

    # `<=` rather than an equality test, so a start value past the limit
    # stops immediately instead of retrying without bound.
    while attempt <= max_attempts:
        try:
            text = requests.get(proxied_url).text
        except requests.RequestException:
            text = ''
        if text != '':
            return (url, text)
        attempt += 1
    return (url, '')

In [8]:
chunksize = 50

url_list = df['url'].to_list()
url_chunks = chunker(url_list, chunksize)
# Ceiling division: tqdm needs the true number of batches. A fractional
# len/chunksize undercounts whenever the last batch is partial, and the
# progress bar then overruns its total.
num_chunks = -(-len(url_list) // chunksize)

url_html_dict = {}
for url_chunk in tqdm(url_chunks, total=num_chunks):
    # Fetch one batch of pages on 25 worker threads.
    with ThreadPoolExecutor(max_workers=25) as executor:
        # getPage returns (url, html) pairs, which dict.update consumes directly.
        url_html_dict.update(executor.map(getPage, url_chunk))
    # Random 2-10 second pause between batches (presumably to go easy on the
    # remote site).
    time.sleep(random.randint(2, 10))

343it [1:01:07, 10.69s/it]                            


In [9]:
# Attach the fetched HTML to each row by looking its url up in url_html_dict;
# a url missing from the dict yields a missing value.
df['html_text'] = df['url'].map(url_html_dict)

In [10]:
print('saving data to file...')
df.to_csv('../data/anime_list_html.csv.xz', index=False)

# Bound, expanding IN-list instead of pasting the Python repr of the url list
# into the SQL text. The repr switches to double quotes for urls containing an
# apostrophe, text() would read any ":name" sequence inside a url as a bind
# parameter, and an empty list would render the invalid "IN ()".
delete_stmt = sql.text('DELETE FROM web_scrape WHERE url IN :urls').bindparams(
    sql.bindparam('urls', expanding=True)
)

# Engine.begin() runs the block in a single transaction that is committed on
# success and rolled back on error. With a plain db.connect() the changes are
# never committed under SQLAlchemy 2.x.
with db.begin() as con:
    print('removing from db...')
    # Delete in 1000-url batches to keep each statement's parameter count small.
    for url_batch in chunker(df['url'].to_list(), 1000):
        con.execute(delete_stmt, {'urls': url_batch})
    print('saving data to db...')
    # Insert in 1000-row batches.
    chunks = chunker(df[['url', 'html_text']], 1000)
    for chunk in tqdm(chunks):
        chunk.to_sql('web_scrape', con, if_exists='append', index=False, method='multi')

saving data to file...
removing from db...


0it [00:00, ?it/s]

saving data to db...


18it [01:13,  4.09s/it]
