In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys
import time
import random
import pickle
import json
from itertools import cycle
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool
import sqlalchemy
from sqlalchemy import create_engine

# sys.path.insert(0, '../tools/')
# from specialRequests import specialRequests

In [2]:
class AnimePlanetCrawler:
    """Crawler state holder backed by a PostgreSQL database.

    Only the constructor is defined in the class body; it opens the
    credentials file and builds the SQLAlchemy engine stored on ``self.db``.
    """

    def __init__(self, credentials_path='../tools/credentials.json',
                 db_host='192.168.0.3', db_port=5432, db_name='animeplanet'):
        """Read the database login and create the engine.

        Args:
            credentials_path: JSON file containing a ``"dblogin"`` object
                with ``"username"`` and ``"password"`` keys.
            db_host: Host name or address of the PostgreSQL server.
            db_port: TCP port of the PostgreSQL server.
            db_name: Name of the database to connect to.

        Raises:
            OSError: If the credentials file cannot be opened.
            KeyError: If the expected keys are missing from the file.
        """
        with open(credentials_path) as file:
            credentials = json.load(file)

        db_login = credentials["dblogin"]
        username = db_login["username"]
        password = db_login["password"]

        # The defaults reproduce the previously hard-coded connection string.
        db_string = f"postgresql://{username}:{password}@{db_host}:{db_port}/{db_name}"
        self.db = create_engine(db_string)

In [3]:
def loadData(self):
    """Initialise the in-memory crawl state from the ``web_scrape`` table.

    After this call:
      * ``self.done``    - set of URLs whose ``html_text`` is not NULL
      * ``self.pending`` - set of URLs whose ``html_text`` is NULL
      * ``self.novel``   - empty set
      * ``self.batch``   - empty dict
    """
    print('loading data...')

    scraped_sql = """SELECT url 
                    FROM web_scrape 
                    WHERE html_text IS NOT NULL;"""
    unscraped_sql = """SELECT url 
                    FROM web_scrape 
                    WHERE html_text IS NULL;"""

    with self.db.connect() as con:
        scraped_rows = pd.read_sql(sqlalchemy.text(scraped_sql), con)
        self.done = set(scraped_rows['url'].to_list())

        unscraped_rows = pd.read_sql(sqlalchemy.text(unscraped_sql), con)
        self.pending = set(unscraped_rows['url'].to_list())

        # Per-save accumulators start out empty.
        self.novel = set()
        self.batch = {}

In [4]:
def saveData(self):
    """Persist the current crawl state to disk and to the ``web_scrape`` table.

    Steps:
      1. Remove the URLs of the current batch from ``self.novel`` and
         ``self.pending``.
      2. Dump done/pending/novel/batch to ``../data/urls.pkl`` and
         ``../data/urls.json``.
      3. In the database: delete any existing rows for the batch URLs,
         append the batch rows (with their HTML), then append the novel URLs
         with a NULL ``html_text``.
      4. If appending the novel rows fails, reset the batch rows to
         ``html_text = NULL`` and move the batch URLs back from done to
         pending in memory.
      5. Clear ``self.batch`` and ``self.novel``.

    URL lists are sent to the database as bound parameters rather than being
    spliced into the SQL text, because they come from scraped pages and may
    contain quotes.
    """
    print('saving data...')

    batch_urls = list(self.batch.keys())

    self.novel = self.novel.difference(batch_urls)
    self.pending = self.pending.difference(batch_urls)
    novel_urls = list(self.novel)

    data_dict = {'done': list(self.done),
                 'pending': list(self.pending),
                 'novel': novel_urls,
                 'batch': self.batch}

    with open('../data/urls.pkl', 'wb') as file:
        pickle.dump(data_dict, file)

    with open("../data/urls.json", 'w') as file:
        json.dump(data_dict, file, indent=2)

    batch_df = pd.DataFrame({'url': batch_urls,
                             'html_text': list(self.batch.values())})

    # np.nan (lower case) exists in every NumPy version; the np.NaN alias
    # was removed in NumPy 2.0.
    novel_df = pd.DataFrame({'url': novel_urls,
                             'html_text': [np.nan] * len(novel_urls)})

    def _with_url_list(sql):
        # ":urls" is expanded at execution time into one bound parameter
        # per list element.
        return sqlalchemy.text(sql).bindparams(
            sqlalchemy.bindparam('urls', expanding=True))

    # NOTE(review): the statements below are executed without an explicit
    # commit, as in the original code - confirm that the installed
    # SQLAlchemy version commits them.
    with self.db.connect() as con:
        print('\tremoving popped pending data...')
        if batch_urls:  # an empty IN () list is not valid SQL
            con.execute(
                _with_url_list("DELETE FROM web_scrape WHERE url IN :urls"),
                {'urls': batch_urls})

        print('\tsaving done data...')
        batch_df.to_sql('web_scrape', con, index=False, if_exists='append')

        try:
            print('\tsaving pending data...')
            novel_df.to_sql('web_scrape', con, index=False, if_exists='append')
        except Exception as e:
            print(e)
            # Roll the batch back to "pending" both in the database and in
            # memory so that it is scraped again later.
            if batch_urls:
                con.execute(
                    _with_url_list("UPDATE web_scrape "
                                   "SET html_text = NULL "
                                   "WHERE url IN :urls"),
                    {'urls': batch_urls})
            self.done = self.done.difference(batch_urls)
            self.pending = self.pending.difference(novel_urls)
            self.pending = self.pending.union(batch_urls)

    self.batch = {}
    self.novel = set()

In [5]:
def scrapePage(url):
    """Fetch the HTML of ``url`` through the local special-requests service.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A ``(url, html_text)`` tuple. ``html_text`` is the empty string for
        ``forum/members`` URLs that end in ``'.'``; those are not requested.
    """
    if 'forum/members' in url and url.endswith('.'):
        return (url, '')

    # Pass the target as a query parameter so that requests percent-encodes
    # it; interpolating it into the query string truncates or splits any URL
    # that itself contains '&', '#' or '?'.
    resp = requests.get('http://192.168.0.3:5000/special-requests',
                        params={'url': url})

    return (url, resp.text)

In [6]:
def parsePage(html_text):
    """Extract the site-relative links of a page as absolute URLs.

    Args:
        html_text: HTML source of the page; the empty string yields no links.

    Returns:
        A set of ``https://www.anime-planet.com/...`` URLs, one for every
        ``<a href>`` whose value starts with a single ``'/'``.
    """
    if html_text == '':
        return set()

    soup = BeautifulSoup(html_text, 'html.parser')

    cur_urls = set()
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is None:
            continue
        href = str(href)
        # A leading '//' is a protocol-relative reference to another host,
        # not a path on this site; prefixing the domain to it would produce
        # a bogus URL, so it is skipped.
        if href.startswith('/') and not href.startswith('//'):
            cur_urls.add(f'https://www.anime-planet.com{href}')

    return cur_urls

In [7]:
def popBatch(self):
    """Remove and return the next batch of URLs from ``self.pending``.

    The batch size is chosen so that ``len(self.done)`` reaches the next
    multiple of 50 once every popped URL has been marked done. Fewer URLs
    are returned if ``self.pending`` runs out first.

    For a popped URL that ends in ``'.'``, a rewritten variant
    (``forum/members`` replaced by ``users``, trailing dot removed) is queued
    in ``self.pending`` unless it has already been handled.

    Returns:
        The set of URLs removed from ``self.pending``.
    """
    dist_to50 = 50 - (len(self.done) % 50)

    popped_urls = set()
    # Stop when pending is drained: set.pop() raises KeyError on an empty set.
    while self.pending and len(popped_urls) < dist_to50:
        pop_url = self.pending.pop()

        if pop_url.endswith('.'):
            new_url = pop_url.replace('forum/members', 'users')[:-1]
            # Do not re-queue something that was already scraped or that is
            # already part of this batch.
            if new_url not in self.done and new_url not in popped_urls:
                self.pending.add(new_url)

        popped_urls.add(pop_url)

    return popped_urls

In [8]:
def processCrawlResults(self, url_html_tup):
    """Fold one batch of scrape results into the crawl state.

    Args:
        url_html_tup: Sequence of ``(url, html_text)`` pairs.

    Effects:
        * every URL of the batch is added to ``self.done`` and stored in
          ``self.batch`` (an empty page is recorded as ``'failed scrape'``);
        * links found on the pages that are in neither ``self.pending`` nor
          ``self.done`` are added to ``self.novel`` and ``self.pending``.
    """
    # Parse all pages in two worker processes.
    with Pool(2) as workers:
        link_sets = workers.map(parsePage, [item[1] for item in url_html_tup])

    discovered = set()
    for link_set in link_sets:
        discovered.update(link_set)

    for page_url, page_html in url_html_tup:
        self.done.add(page_url)
        if page_html == '':
            self.batch[page_url] = 'failed scrape'
        else:
            self.batch[page_url] = page_html

    # Filter after the batch was marked done, so that pages of this batch
    # are not queued again.
    unseen = discovered.difference(self.pending, self.done)
    self.novel.update(unseen)
    self.pending.update(unseen)

In [9]:
def printCrawlProgress(self):
    """Print the pending count, the done count and the fraction done.

    The fraction is ``done / (done + pending)``. It is printed as ``0`` only
    when both sets are empty, which is the one case where the division is
    undefined; an empty pending set with finished pages reports ``1.0``.

    Returns:
        The number of URLs in ``self.done``.
    """
    len_done = len(self.done)
    len_pending = len(self.pending)
    total = len_done + len_pending
    done_fraction = len_done / total if total else 0
    print(len_pending, len_done, done_fraction)
    return len_done

In [10]:
def waiter(secs):
    """Sleep for ``secs`` seconds in one-second steps behind a tqdm progress bar."""
    print(f'waiting {secs} secs...')
    ticks = tqdm(range(secs))
    for _tick in ticks:
        time.sleep(1)

In [11]:
def crawl(self):
    """Run the crawl loop until no pending URLs remain.

    Each iteration pops a batch of URLs, fetches them concurrently with
    ``scrapePage``, merges the results into the crawl state and prints the
    progress. Depending on the number of finished pages:

      * every 100 pages the elapsed time is printed, followed by a short
        random pause;
      * every 1000 pages the state is saved instead of the short pause;
      * every 10000 / 100000 / 500000 pages a progressively longer random
        pause follows the save.

    When the loop ends, anything collected since the last periodic save is
    saved as well, so the final partial batch is not lost.
    """
    self.loadData()
    print('starting crawl...')
    start_time = time.time()

    while len(self.pending) > 0:

        popped_urls = self.popBatch()

        with ThreadPoolExecutor(max_workers=50) as executor:
            url_html_tup = list(executor.map(scrapePage, popped_urls))

        self.processCrawlResults(url_html_tup)

        len_done = self.printCrawlProgress()

        if len_done % 100 != 0:
            continue

        end_time = time.time()
        print('timer: ', end_time-start_time)

        if len_done % 1000 == 0:
            self.saveData()
            # Longer pauses at larger milestones; the largest one wins.
            if len_done % 500000 == 0:
                waiter(random.randint(3600*3, 3600*5))
            elif len_done % 100000 == 0:
                waiter(random.randint(1800, 3600))
            elif len_done % 10000 == 0:
                waiter(random.randint(300, 600))
            print('starting crawl...')
        else:
            time.sleep(random.randint(5, 10))

        start_time = time.time()

    # The periodic save only fires on multiples of 1000; flush whatever is
    # left when the pending set runs dry. saveData() empties both
    # containers, so this never saves the same batch twice.
    if self.batch or self.novel:
        self.saveData()

In [12]:
# Attach the module-level functions defined in the cells above to the class
# as methods, each under its own function name.
for _method in (loadData, saveData, popBatch,
                processCrawlResults, printCrawlProgress, crawl):
    setattr(AnimePlanetCrawler, _method.__name__, _method)

In [13]:
# Build the crawler: the constructor reads the database login from
# ../tools/credentials.json and creates the SQLAlchemy engine.
crawler = AnimePlanetCrawler()

In [14]:
# Start the crawl: loads the done/pending URL sets from the database and
# loops until no pending URLs remain.
crawler.crawl()

loading data...
starting crawl...
2400162 1568525 0.39522517144839087
2400769 1568550 0.3951685415054824
2401183 1568575 0.3951311389762298
2401710 1568600 0.395082499855175
2402466 1568625 0.39501109392859546
2403160 1568650 0.3949458810970313
2403663 1568675 0.39489967872824516
2404158 1568700 0.3948542837423336
2404592 1568725 0.39481496190714205
2405162 1568750 0.39476213866839527
2405641 1568775 0.3947183686861164
2406159 1568800 0.39467073748433634
2406669 1568825 0.39462391340547864
2407182 1568850 0.3945768042108313
2407856 1568875 0.39451373502507464
2408200 1568900 0.3944834175655628
2408753 1568925 0.39443237989600965
2409341 1568950 0.39437788738933377
2409757 1568975 0.3943404582163363
2410267 1569000 0.3942937229394258
timer:  190.86284804344177
saving data...
	removing popped pending data...
	saving done data...
	saving pending data...


  0%|          | 0/30 [00:00<?, ?it/s]

waiting 30 secs...


 93%|█████████▎| 28/30 [00:29<00:02,  1.04s/it]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/alancmathew/miniconda3/envs/anime-recommendation-engine/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-14-26686de0f395>", line 1, in <module>
    crawler.crawl()
  File "<ipython-input-11-9634bf06ef52>", line 29, in crawl
    self.waiter(30)
  File "<ipython-input-10-059a845fbe5b>", line 4, in waiter
    time.sleep(1)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/alancmathew/miniconda3/envs/anime-recommendation-engine/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2061, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "

TypeError: object of type 'NoneType' has no len()