In [1]:
import pickle
import numpy as np
import pandas as pd
import json
import sqlalchemy as sql
from sqlalchemy import create_engine
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
from io import StringIO
from urllib.parse import quote
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool
import time
import random
import re
import itertools

In [2]:
# Database login details live outside the notebook, in a JSON file.
with open('../tools/credentials.json') as credentials_file:
    credentials = json.load(credentials_file)

# Pull both login fields out of the "dblogin" section.
db_login = credentials["dblogin"]
username, password = db_login["username"], db_login["password"]

In [3]:
# Percent-encode the credentials before placing them in the URL so characters
# that are special in a URL ('@', ':', '/', '%', ...) cannot corrupt the
# connection string; the plain values are recovered when the URL is parsed.
db_string = (
    f"postgresql://{quote(username, safe='')}:{quote(password, safe='')}"
    "@localhost:5432/animeplanet"
)
db = create_engine(db_string)

In [4]:
def chunker(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    ``seq`` must support ``len()`` and slicing. Returns a generator; the final
    slice is shorter when ``len(seq)`` is not a multiple of ``size``.
    """
    offsets = range(0, len(seq), size)
    return (seq[start:start + size] for start in offsets)

### Parse User Watch List Tables

In [5]:
# The query keeps only users whose num_anime_pages is greater than zero.
user_query = 'SELECT * FROM "user" WHERE num_anime_pages > 0;'
user_df = pd.read_sql(user_query, db)

In [6]:
# Build the URL of every watch-list page of every user. The first page carries
# no `page` query parameter; later pages append `&page=N`.
urls = []

# The per-user name is bound to `list_owner` rather than `username`, so the
# database login name read from the credentials file is not overwritten here.
for list_owner, num_anime_pages in zip(user_df['username'], user_df['num_anime_pages']):
    for page_num in range(1, int(num_anime_pages) + 1):
        if page_num == 1:
            url = f'https://www.anime-planet.com/users/{list_owner}/anime?sort=title&mylist_view=list'
        else:
            url = f'https://www.anime-planet.com/users/{list_owner}/anime?sort=title&mylist_view=list&page={page_num}'

        urls.append(url)

# The URL list is passed as an expanding bound parameter instead of being pasted
# into the SQL text: values containing quotes cannot break (or inject into) the
# statement, and an empty list simply matches no rows.
# The two columns are selected explicitly, in this order, because parseTable
# unpacks every row as (url, html_text).
query = sql.text("""
        SELECT url, html_text
        FROM web_scrape
        WHERE html_text IS NOT NULL
        AND url IN :urls
        LIMIT 1000;
        """).bindparams(sql.bindparam('urls', expanding=True))
scraped = pd.read_sql(query, db, params={'urls': urls})

In [7]:
def parseTable(url_html_tup):
    """Parse one scraped watch-list page into a DataFrame of list entries.

    Parameters
    ----------
    url_html_tup : tuple
        ``(url, html_text)``: the watch-list page URL and the page's raw HTML.

    Returns
    -------
    pandas.DataFrame
        One row per row of the first ``<table>`` on the page, with the eight
        table columns plus ``time_watched``, ``anime_url`` and ``username``.
        If the page cannot be parsed, a warning is emitted and an empty frame
        with the ten base columns is returned, so one bad page does not abort
        a whole batch.
    """
    (url, html_text) = url_html_tup
    try:
        soup = BeautifulSoup(html_text, 'html.parser')
        table = soup.find('table')
        df = pd.read_html(StringIO(str(table)))[0]
        df.columns = ['title', 'type', 'year', 'avg', 'status', 'eps', 'times_watched', 'rating']
        # NOTE(review): this adds a separate `time_watched` column and leaves
        # `times_watched` as scraped; the empty fallback frame below has no
        # `time_watched` column. Confirm against the watch_list schema whether
        # `times_watched` was meant to be overwritten instead.
        df['time_watched'] = df['times_watched'].str.extract(r'([0-9]*)', expand=False).astype('float')

        # Decide per title cell with a plain conditional. np.where evaluates
        # both branches eagerly, so a cell without a link used to raise and
        # the whole page was discarded instead of yielding 'no link'.
        anime_urls = []
        for title_cell in table.find_all('td', attrs={'class': 'tableTitle'}):
            tag = title_cell.find('a')
            if tag is not None and tag.has_attr('href'):
                anime_urls.append('https://www.anime-planet.com' + tag.get('href'))
            else:
                anime_urls.append('no link')
        df['anime_url'] = anime_urls
        df['anime_url'] = df['anime_url'].astype('string')

        # Take the whole path segment after /users/ so names containing
        # characters other than ASCII letters and digits are still recognised.
        df['username'] = str(re.findall(r'/users/([^/]*)/', url)[0])

        return df

    except Exception as exc:
        # Best effort: report the page that failed and keep going. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        import warnings
        warnings.warn(f'parseTable: could not parse {url!r}: {exc!r}')
        return pd.DataFrame(columns=['title', 'type', 'year', 'avg', 'status',
                                     'eps', 'times_watched', 'rating', 'anime_url', 'username'])

In [8]:
chunksize = 40

list_of_tups = [tuple(r) for r in scraped.to_numpy()]
tup_chunks = chunker(list_of_tups, chunksize)
# Ceiling division, so the progress bar total is an integer chunk count.
num_chunks = -(-len(list_of_tups) // chunksize)

# Rows about to be re-inserted are deleted first. The (anime_url, username)
# pairs are passed as an expanding bound parameter instead of being pasted into
# the SQL text, so values containing quotes cannot break the statement.
delete_stmt = sql.text("""
        DELETE
        FROM watch_list
        WHERE (anime_url, username) IN :pairs;
        """).bindparams(sql.bindparam('pairs', expanding=True))

batch_frames = []
# One worker pool serves every chunk instead of being rebuilt per chunk.
with Pool(4) as p:
    for chunk in tqdm(tup_chunks, total=num_chunks):
        batch_df = pd.concat(p.map(parseTable, chunk), ignore_index=True)
        batch_frames.append(batch_df)

        # Nothing parsed in this chunk: there is nothing to delete or insert,
        # and an empty IN list would not be valid SQL.
        if batch_df.empty:
            continue

        pairs = list(
            batch_df[['anime_url', 'username']]
            .drop_duplicates()
            .itertuples(index=False, name=None)
        )

        # Explicit transaction: the delete and the insert are committed
        # together on success and rolled back together on error.
        with db.begin() as con:
            con.execute(delete_stmt, {'pairs': pairs})
            batch_df.to_sql('watch_list', con, if_exists='append', index=False, method='multi')

# Concatenate once at the end rather than growing `df` inside the loop.
df = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()

100%|██████████| 25/25.0 [01:24<00:00,  3.38s/it]
