In [14]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
import time
import random
import pickle
import json
from itertools import cycle
from concurrent.futures import ThreadPoolExecutor
from sqlalchemy import create_engine

sys.path.insert(0, '../tools/')
from specialRequests import specialRequests

In [2]:
class AnimePlanetCrawler:
    def __init__(self):
        with open('../tools/credentials.json') as file:
            credentials = json.load(file)

        username = credentials["dblogin"]["username"]
        password = credentials["dblogin"]["password"]

        db_string = f"postgresql://{username}:{password}@192.168.0.3:5432/animeplanet"
        self.db = create_engine(db_string)
        
        self.sr = specialRequests()

In [3]:
def loadData(self):
    print('loading data...')
    self.urls = {}
    with self.db.connect() as con:
        self.urls['done'] = set(pd.read_sql('SELECT url FROM web_scrape WHERE html_text IS NOT NULL;', con)['url'].to_list())
        self.urls['todo'] = set(pd.read_sql('SELECT url FROM web_scrape WHERE html_text IS NULL;', con)['url'].to_list())

In [4]:
def saveData(self):
    print('saving data...')
    
    with open('../data/urls.pkl','wb') as file:
        pickle.dump(self.urls, file)
        
    with open("../data/urls.json", 'w') as file:
        json.dump({'done':list(self.urls['done']), 'todo':list(self.urls['todo'])}, file, indent=2) 
        
    page_df = pd.DataFrame(self.page_data)
    batch_df = page_df.loc[page_df['html_text'].notnull()][]
    novel_df = page_df.loc[page_df['html_text'].isnull()]
    
    batch_urls = batch_df['url'].to_list()
    novel_urls = novel_df['url'].to_list()
    
    df_inter = set(batch_urls).intersection(set(novel_urls))
    dict_inter = self.urls['done'].intersection(self.urls['todo'])
    
    if (len(df_inter) != 0) or (len(dict_inter) != 0):
        print('df_inter', df_inter)
        print('dict_inter', dict_inter)
    
    with self.db.connect() as con:
        print('\tremoving popped todo data...')
        con.execute(f"DELETE FROM web_scrape WHERE url in ({str(batch_urls)[1:-1]})")
        
        print('\tsaving done data...')
        batch_df.to_sql('web_scrape', con, index=False, if_exists='append')

        try:
            print('\tsaving todo data...')
            novel_df.to_sql('web_scrape', con, index=False, if_exists='append')
        except Exception as e: 
            print(e)
            con.execute(f"UPDATE web_scrape SET html_text = NULL WHERE url IN ({str(done_urls)[1:-1]})")
            self.urls['done'] = self.urls['done'].difference(batch_urls)
            self.urls['todo'] = self.urls['todo'].difference(novel_urls)
            self.urls['todo'] = self.urls['todo'].union(batch_urls)
        
    del page_df

In [5]:
def scrapePage(self, url):    
 
    cur_urls = set()

    html_text = self.sr.get(url)
    soup = BeautifulSoup(html_text, 'html.parser')

    for link in soup.find_all('a'):
        try:
            branch = link.get('href')
            if branch[0] == '/':
                cur_urls.add('https://www.anime-planet.com' + branch)
        except:
            pass
    
    return cur_urls, (url, html_text)

In [6]:
def popTodoSet(self):
    disallowed_urls = ['https://www.anime-planet.com/search.php', 'https://www.anime-planet.com/login', 
                       'https://www.anime-planet.com/sign-up']
    
    dist_to25 = (25 - (len(self.urls['done']) % 25))
    
    popped_urls = set()
    while len(popped_urls) < dist_to25:
        pop_url = self.urls['todo'].pop()

        if pop_url[-1] == '.':
            old_url = pop_url
            pop_url = pop_url.replace('forum/members', 'users')[:-1]
            if (old_url not in self.urls['done']) and (old_url not in disallowed_urls):
                self.urls['done'].add(old_url)
                self.page_data['url'].append(old_url)
                self.page_data['html_text'].append('failed scrape')
                
        if (pop_url not in self.urls['done']) and (pop_url not in disallowed_urls):
            popped_urls.add(pop_url)
            
    return popped_urls

In [7]:
def processCrawlResults(self, results):
    cur_urls = set().union(*map(lambda x: x[0], results))
    url_html_tup = map(lambda x: x[1], results)
    for url, html_text in url_html_tup:
        self.urls['done'].add(url)
        self.page_data['url'].append(url)
        if html_text == '':
            self.page_data['html_text'].append('failed scrape')
        else:
            self.page_data['html_text'].append(html_text)
      
    novel = (cur_urls.difference(self.urls['todo'])).difference(self.urls['done'])
            
    self.urls['todo'] = self.urls['todo'].union(novel)

    for url in novel:
        self.page_data['url'].append(url)
        self.page_data['html_text'].append(np.NaN)

In [8]:
def printCrawlProgress(self):
    len_done = len(self.urls['done'])
    len_todo = len(self.urls['todo'])
    print(len_todo, len_done, 0 if len_todo == 0 else len_done/(len_todo+len_done))

In [9]:
def crawl(self):
    self.loadData()
    print('starting crawl...')
    self.page_data = {'url':[], 'html_text':[]}
    start_time = time.time()
    while len(self.urls['todo']) > 0:
        
        popped_urls = self.popTodoSet()    
            
        with ThreadPoolExecutor(max_workers=25) as executor:
            results = list(executor.map(self.scrapePage, list(popped_urls)))
        
        self.processCrawlResults(results)
        
        self.printCrawlProgress()
        
        len_done = len(self.urls['done'])
        if len_done % 500 == 0:
            end_time = time.time()
            print('timer: ', end_time-start_time)
            self.saveData()
            self.loadData()
            self.page_data = {'url':[], 'html_text':[]}
            print('starting crawl...')
            start_time = time.time()

In [10]:
AnimePlanetCrawler.loadData = loadData
AnimePlanetCrawler.saveData = saveData
AnimePlanetCrawler.scrapePage = scrapePage
AnimePlanetCrawler.popTodoSet = popTodoSet
AnimePlanetCrawler.processCrawlResults = processCrawlResults
AnimePlanetCrawler.printCrawlProgress = printCrawlProgress
AnimePlanetCrawler.crawl = crawl

In [11]:
crawler = AnimePlanetCrawler()

In [12]:
crawler.crawl()

loading data...
starting crawl...
1572932 576825 0.2683210241901759
1572965 576850 0.26832541404725524
1572992 576875 0.2683305525411572
1573032 576901 0.2683344085606389
1573034 576925 0.26834232652808726
1573064 576950 0.2683470898329034
1573093 576975 0.2683519777048912
1573157 577000 0.2683524970502154
timer:  48.73614525794983
saving data...
	removing popped todo data...
	saving done data...
	saving todo data...
(psycopg2.errors.UniqueViolation) duplicate key value violates unique constraint "web_scrape_pk"
DETAIL:  Key (url)=(https://www.anime-planet.com/users/Pandakilla/manga/read) already exists.

[SQL: INSERT INTO web_scrape (url, html_text) VALUES (%(url)s, %(html_text)s)]
[parameters: ({'url': 'https://www.anime-planet.com/characters/machiko-kano/hates', 'html_text': None}, {'url': 'https://www.anime-planet.com/characters/penguin-2/comments', 'html_text': None}, {'url': 'https://www.anime-planet.com/characters/narrator-gosenzo-sama-banbanzai/lists', 'html_text': None}, {'url

KeyboardInterrupt: 