In [1]:
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.common.by import By
from datetime import datetime, timedelta
from pysqlite import Sqlite
import uuid
import time
import os


In [None]:
# Helpers
def remove_invalid_chars(string):
    """Return `string` with every non-alphabetic character dropped.

    Only characters for which ``str.isalpha()`` is true survive, so digits,
    whitespace and punctuation are all removed.
    """
    return "".join(filter(str.isalpha, string))

def create_folder(story_name, root="stories"):
    """Make sure the folder ``<root>/<story_name>`` exists and return its path.

    Parameters:
        story_name: name of the sub-folder to create inside `root`.
        root: parent directory; created too when it is missing.

    Returns:
        The joined path ``os.path.join(root, story_name)``.

    Raises:
        OSError: when the directory cannot be created (for example a regular
            file already occupies the path).
    """
    folder = os.path.join(root, story_name)
    # exist_ok=True replaces the separate exists() check, which left a window
    # in which another process could create the folder and make makedirs fail.
    os.makedirs(folder, exist_ok=True)
    return folder

def save_chapter_text(folder, chap_id, chap_text):
    """Write one chapter's text to ``<folder>/chapter_<chap_id>.txt``.

    Parameters:
        folder: existing directory that receives the file.
        chap_id: value interpolated into the file name.
        chap_text: chapter body to store; any existing file is overwritten.
    """
    filename = os.path.join(folder, f"chapter_{chap_id}.txt")
    # Scraped text routinely contains characters outside the platform default
    # code page (curly quotes, dashes, CJK); pin UTF-8 so the write cannot
    # fail with UnicodeEncodeError depending on the machine's locale.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(chap_text)

In [2]:
class BoxNovelTracker:
    """Bookkeeping for stories and chapters scraped from BoxNovel.

    Story and chapter rows go through the ``Sqlite`` engine opened on
    ``story_data.db`` (tables ``story`` and ``storydata``); chapter bodies are
    written as text files under ``<__source__>_files/<storyid>/``.
    """
    __url__    = "https://boxnovel.com/"
    __source__ = "BoxNovel"

    # Single timestamp layout for every date this class produces. The
    # `datetime` property used to emit "%Y-%d-%m" (day and month swapped),
    # which disagreed with `process_releaseddate` and made the string ordering
    # used by `orderby="entrydate DESC"` non-chronological.
    # NOTE(review): rows saved before this fix still carry the swapped layout.
    _TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%f"

    # Singular unit word of an "<n> <unit>s ago" label -> timedelta keyword.
    _RELATIVE_UNITS = {
        "sec": "seconds", "second": "seconds",
        "min": "minutes", "minute": "minutes",
        "hour": "hours",
        "day": "days",
        "week": "weeks",
    }

    def __init__(self):
        """Open the database, make sure the files folder exists, load stories."""
        self._db_engine      = Sqlite.init("story_data.db")
        self.__filesfolder__ = f"{self.__source__}_files"
        os.makedirs(self.__filesfolder__, exist_ok=True)
        self.load_stories()

    @property
    def datetime(self):
        """Current local time formatted with ``_TIMESTAMP_FORMAT``."""
        return datetime.now().strftime(self._TIMESTAMP_FORMAT)

    @property
    def stories(self):
        """Stories of this source keyed by name, re-read from the database on every access."""
        self.load_stories()
        return self._stories

    def _get_file_path(self, storyid):
        """Return a fresh, unique ``.txt`` path inside the story's folder, creating the folder."""
        uid    = uuid.uuid4().hex
        folder = f"{self.__filesfolder__}/{storyid}"
        os.makedirs(folder, exist_ok=True)
        return f"{folder}/{storyid}{uid}.txt"

    def process_releaseddate(self, released_date):
        """Convert a release label into a ``_TIMESTAMP_FORMAT`` string.

        Relative labels ("6 hours ago", "1 day ago", "an hour ago",
        "5 mins ago", ...) are subtracted from the current time; anything else
        is parsed as an absolute date such as "September 5, 2022".

        Raises:
            ValueError: when the label matches neither form.
        """
        text  = released_date.strip()
        words = text.lower().split()

        moment = None
        if len(words) >= 3 and words[-1] == "ago":
            unit = self._RELATIVE_UNITS.get(words[-2].rstrip("s"))
            if unit is not None:
                # "a"/"an" stand for one unit ("an hour ago").
                amount = 1 if words[0] in ("a", "an") else int(words[0])
                moment = datetime.now() - timedelta(**{unit: amount})

        if moment is None:
            moment = datetime.strptime(text, "%B %d, %Y")

        return moment.strftime(self._TIMESTAMP_FORMAT)

    def load_stories(self):
        """Refresh ``self._stories`` with the rows of table ``story`` for this source."""
        data = self._db_engine.read("story", "*", where=f"source='{self.__source__}'")
        # `or []` tolerates an engine that reports "no rows" as None.
        self._stories = {s['name']: s for s in (data or [])}

    def exists(self, name):
        """ return true if story already exists otherwise false """
        return name in self._stories

    def latest_update(self, storyid):
        """ return latest update of given story if update exists otherwise None """
        return self._db_engine.read('storydata', '*', where=f'storyid={storyid}', limit=1, orderby="entrydate DESC", single=True)

    def get_existing_chapters(self, storyid):
        """ return all of the scraped chapters """
        return self._db_engine.read('storydata', '*', where=f'storyid={storyid}', orderby="entrydate DESC")

    def save(self, name, url):
        """ save new story into db and reload stories """
        story = {"name": name, "url": url, "source": self.__source__,
                 "sourceurl": self.__url__, "entrydate": self.datetime}
        if not self._db_engine.save("story", story):
            return False
        self.load_stories()
        return True

    def save_storydata(self, storyid, chapname, releaseddate, content):
        """Write the chapter text to a new file and record it in ``storydata``.

        Returns whatever the engine's ``save`` returns for the new row.
        """
        filepath = self._get_file_path(storyid)
        # Explicit UTF-8: chapter text is arbitrary Unicode and must not
        # depend on the platform's default code page.
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(content)

        released_date = self.process_releaseddate(releaseddate)
        record        = {'storyid': storyid, 'chaptername': chapname, 'releaseddate': released_date,
                         'datafile': filepath, 'entrydate': self.datetime}
        return self._db_engine.save('storydata', record)
    
        
    
        

In [3]:
# Chrome launch options shared by every browser session created in this notebook.
opt = webdriver.ChromeOptions()
opt.add_argument("--start-maximized")

# Must run before any webdriver.Chrome(...) call below; as the last expression
# of the cell, its return value (the chromedriver path) is echoed as output.
chromedriver_autoinstaller.install()

'C:\\Users\\H P\\Envs\\scrapperenv\\lib\\site-packages\\chromedriver_autoinstaller\\105\\chromedriver.exe'

In [4]:
# Primary browser session: used below for listing pages and story pages.
driver = webdriver.Chrome(options=opt)

## Fetch New Stories 

In [17]:
# Tracker instance; its constructor opens story_data.db and loads the stored stories.
tracker = BoxNovelTracker()

In [18]:
# Walk the paginated story listing and register every story that the tracker
# does not know yet. The walk ends at the site's "Page not found" page.
page_no          = 1
MAX_TRY          = 3   # attempts per listing page
MAX_FAILED_PAGES = 5   # consecutive unloadable pages tolerated before giving up
tracker = BoxNovelTracker()

failed_pages = 0
while True:
    url = f"https://boxnovel.com/page/{page_no}/"

    # `loaded` records whether any of the MAX_TRY attempts succeeded.
    loaded = False
    for attempt in range(1, MAX_TRY + 1):
        try:
            driver.get(url)
            loaded = True
            break
        except Exception:
            # `Exception`, not a bare except, so that interrupting the kernel
            # (KeyboardInterrupt) still stops the cell.
            print(f"[{attempt}/{MAX_TRY}] failed to fetch url ({url}) - retrying ", end="\r")
            time.sleep(2)

    if loaded:
        failed_pages = 0
        if driver.title == "Page not found – BoxNovel":
            print('exiting')
            break

        stories = driver.find_elements(By.CLASS_NAME, "item-summary")
        if stories:
            for s in stories:
                atag      = s.find_element(By.TAG_NAME, "a")
                name      = atag.text
                # Separate name: reusing `url` here used to clobber the
                # listing-page URL while the page was still being processed.
                story_url = atag.get_attribute("href")
                if not tracker.exists(name):
                    tracker.save(name, story_url)
        else:
            print('no stroy found at ', url)

    else:
        print('skipping ', url)
        # Without this guard a dead connection made the loop skip pages
        # forever, because the only exit was the "Page not found" title.
        failed_pages += 1
        if failed_pages >= MAX_FAILED_PAGES:
            print(f'giving up after {failed_pages} consecutive pages failed to load')
            break
    page_no += 1
        

exiting


## Fetch Story Chapters

In [5]:
# Fresh tracker plus a second browser session. Chapter pages are opened in
# `driver1` by the cells below, while `driver` stays on the story page.
tracker = BoxNovelTracker()
driver1 = webdriver.Chrome(options=opt)

In [7]:
# latest one
MAX_STORY_TRIES = 3    # attempts to load a story page before skipping it
MAX_STORIES     = 10   # number of stories processed per run of this cell


def load_story_page(story_url):
    """Load `story_url` in `driver`; True on success, False after MAX_STORY_TRIES failures."""
    for _ in range(MAX_STORY_TRIES):
        try:
            driver.get(story_url)
            return True
        except Exception:
            continue
    return False


def describe_chapter(chapter):
    """Return (link, name, release label) read from one chapter row element."""
    atag = chapter.find_element(By.TAG_NAME, 'a')
    # The leading "." keeps the XPath relative to this row. The previous
    # "//span[...]" searched from the document root, so every chapter received
    # the release date of the first chapter on the page.
    rdate = chapter.find_element(By.XPATH, ".//span[@class='chapter-release-date']").text
    return atag.get_attribute("href"), atag.text, rdate


def read_chapter_text(chapter_url):
    """Open `chapter_url` in `driver1` and return its <p> texts joined by newlines."""
    driver1.get(chapter_url)
    time.sleep(1)
    content      = driver1.find_element(By.CLASS_NAME, "entry-content")
    chapter_text = content.find_element(By.CLASS_NAME, "text-left")
    return "\n".join(p.text for p in chapter_text.find_elements(By.TAG_NAME, "p"))


stories = tracker.stories
for processed, story in enumerate(stories.values(), start=1):
    loaded = load_story_page(story["url"])
    time.sleep(3)
    if loaded:
        try:
            driver.find_element(By.CLASS_NAME, 'chapter-readmore').click()
            time.sleep(1)
        except Exception:
            pass  # best effort: the "read more" control may be missing or not clickable
        chapters = driver.find_elements(By.CLASS_NAME, 'wp-manga-chapter')

        if tracker.latest_update(story['id']) is None:
            print(f'no record found for {story["id"]} - fethcing all of its chapters')
            pending = chapters
            print(f'fetching {len(pending)} for ', story["id"])
        else:
            print(f'chapters found for {story["id"]} - fetching its latest chapters')
            # A set makes the per-chapter membership test constant time.
            saved_names = {row['chaptername'] for row in tracker.get_existing_chapters(story["id"])}
            pending     = [chapter for chapter in chapters
                           if chapter.find_element(By.TAG_NAME, 'a').text not in saved_names]
            print(f'fetching {len(pending)} new chapters for ', story["id"])

        for i, chapter in enumerate(pending):
            try:
                chaplink, chapname, rdate = describe_chapter(chapter)
                ps_text = read_chapter_text(chaplink)
                tracker.save_storydata(story["id"], chapname, rdate, ps_text)
            except Exception as exc:
                # Report instead of silently dropping the chapter; `i` is the
                # zero-based position in `pending`.
                print('failed to fetch chapter ', i, type(exc).__name__)
            print(f'[{i + 1}/{len(pending)}]', end='\r')
    else:
        print('failed to load ', story['url'], ' - skipping it')

    if processed == MAX_STORIES:
        break
    

chapters found for 1 - fetching its latest chapters
fetching 5 new chapters for  1
chapters found for 2 - fetching its latest chapters
fetching 11 new chapters for  2
chapters found for 3 - fetching its latest chapters
fetching 8 new chapters for  3
chapters found for 4 - fetching its latest chapters
fetching 8 new chapters for  4
chapters found for 5 - fetching its latest chapters
fetching 14 new chapters for  5
chapters found for 6 - fetching its latest chapters
fetching 9 new chapters for  6
chapters found for 7 - fetching its latest chapters
fetching 19 new chapters for  7
no record found for 8 - fethcing all of its chapters
fetching 251 for  8
chapters found for 9 - fetching its latest chapters
fetching 100 new chapters for  9
no record found for 10 - fethcing all of its chapters
fetching 71 for  10
[70/71]

In [None]:
# old one
# Earlier draft of the chapter fetcher, kept for reference. It only handles
# stories that have no stored chapters; the incremental path was never
# written here (the "# latest one" cell above implements it).
stories = tracker.stories
c = 0
for key, story in stories.items():
    latest_record = tracker.latest_update(story['id'])
    if latest_record is None:
        print(f'no record found for {story["id"]} - fethcing its all chapters')

        count = 0
        while count < 3:
            try:
                driver.get(story["url"])
                break
            except Exception:
                count += 1
        time.sleep(3)
        if count < 3:
            try:
                driver.find_element(By.CLASS_NAME, 'chapter-readmore').click()
                time.sleep(1)
            except Exception:
                pass  # best effort: the "read more" control may be missing
            chapters = driver.find_elements(By.CLASS_NAME, 'wp-manga-chapter')
            print(f'fetching {len(chapters)} for ', story["id"])
            for i, chapter in enumerate(chapters):
                try:
                    atag     = chapter.find_element(By.TAG_NAME, 'a')
                    chaplink = atag.get_attribute("href")
                    chapname = atag.text
                    # ".//" scopes the lookup to this chapter row; "//" searched
                    # the whole page and always returned the first row's date.
                    rdate    = chapter.find_element(By.XPATH, ".//span[@class='chapter-release-date']").text
                    driver1.get(chaplink)
                    time.sleep(1)
                    content      = driver1.find_element(By.CLASS_NAME, "entry-content")
                    chapter_text = content.find_element(By.CLASS_NAME, "text-left")

                    ps           = chapter_text.find_elements(By.TAG_NAME, "p")
                    ps_text      = "\n".join( [p.text for p in ps] )

                    tracker.save_storydata(story["id"], chapname, rdate, ps_text)

                except Exception:
                    print('failed to fetch chapter ', i)
                print(f'[{i}/{len(chapters)}]', end='\r')
        else:
            print('failed to load ', story['url'], ' - skipping it')

    else:
        print(f'chapters found for {story["id"]} - fetching its latest chapters')
        chapters     = tracker.get_existing_chapters(story["id"])
        # This statement used to be an unfinished `new_chapters =`, a
        # SyntaxError that stopped the whole cell from running. An empty list
        # keeps the cell valid; nothing is fetched for known stories here.
        new_chapters = []

    c += 1
    if c == 10:
        break
    

no record found for 1 - fethcing its all chapters
fetching 894 for  1
failed to fetch chapter  36
failed to fetch chapter  56
failed to fetch chapter  138
failed to fetch chapter  144
failed to fetch chapter  271
no record found for 2 - fethcing its all chapters
fetching 2066 for  2
failed to fetch chapter  8
failed to fetch chapter  9
failed to fetch chapter  37
failed to fetch chapter  44
failed to fetch chapter  86
failed to fetch chapter  466
failed to fetch chapter  733
failed to fetch chapter  763
[838/2066]

In [5]:
# Manual walkthrough on a single story: open its page in the primary session.
# Note that this rebinds the notebook-level `url` name.
url       = "https://boxnovel.com/novel/top-tier-providence-secretly-cultivate-for-a-thousand-years/"
driver.get(url)

In [7]:
# Element with class 'chapter-readmore'; raises if the page has none.
# NOTE(review): presumably the control that expands the full chapter list — confirm on the page.
show_more = driver.find_element(By.CLASS_NAME, 'chapter-readmore')

In [8]:
# Click it before collecting the chapter rows in the next cell.
show_more.click()

In [9]:
# Collect every chapter row of the story page and show how many were found.
# NOTE(review): the class name carries trailing spaces; the recorded output
# (894) shows the lookup still matched, but 'wp-manga-chapter' would be cleaner.
chapters = driver.find_elements(By.CLASS_NAME, 'wp-manga-chapter    ')
len(chapters)

894

In [16]:
# Probe the first chapter row: its link, its title and its release label.
chapter  = chapters[0]
atag     = chapter.find_element(By.TAG_NAME, 'a')
chaplink = atag.get_attribute("href")
chapname = atag.text
# ".//" makes the XPath relative to `chapter`. With a bare "//" the search
# starts at the document root and returns the first release date on the page
# whichever row is probed (it only looked right here because this is row 0).
rdate    = chapter.find_element(By.XPATH, ".//span[@class='chapter-release-date']").text

print(chapname, ' -> ', rdate)

Chapter 894 - Divine Might Great Heaven Palm, Azure Heaven Mystic  ->  6 hours ago


In [20]:
# Open the probed chapter in the second session so `driver` stays on the story page.
driver1.get(chaplink)

In [23]:
# Pull the chapter body out of the page currently shown in `driver1`:
# the 'text-left' element inside 'entry-content' holds one <p> per paragraph.
content      = driver1.find_element(By.CLASS_NAME, "entry-content")
chapter_text = content.find_element(By.CLASS_NAME, "text-left")
ps           = chapter_text.find_elements(By.TAG_NAME, "p")

# One paragraph per line.
paragraph_lines = (p.text for p in ps)
ps_text         = "\n".join(paragraph_lines)
print("length of content -> ", len(ps_text))


length of content ->  8869


In [28]:
# Store the probed chapter under story id 1: writes the text file and the storydata row.
tracker.save_storydata(1, chapname, rdate, ps_text)

True

### Extract Stories

In [34]:
# Story cards of the listing page currently open in `driver`.
# Uses find_elements(By...) like the rest of the notebook: the
# find_elements_by_* helpers are deprecated and absent from Selenium >= 4.3.
stories = driver.find_elements(By.CLASS_NAME, "item-summary")
print(len(stories))

20


  """Entry point for launching an IPython kernel.


In [35]:
# Build one (link, title, latest chapter, last update) tuple per story card.
# Uses find_element(By...) because the find_element_by_* helpers are
# deprecated and absent from Selenium >= 4.3.
story_links = []
for s in stories:
    atag = s.find_element(By.TAG_NAME, "a")  # looked up once, used for both link and title
    story_links.append((
        atag.get_attribute("href"),
        atag.text,
        s.find_element(By.CLASS_NAME, "chapter").text,
        s.find_element(By.CLASS_NAME, "chapter-item").find_element(By.CLASS_NAME, "post-on").text,
    ))
print("story-links -> ", len(story_links))



story-links ->  20


## Chapter Scraping

In [None]:
# For every listed story: open it, enter the reader, then save one text file
# per chapter while following the "next" link until it can no longer be found.
for story in story_links:
    link, name, lastchap, lastupdate = story
    # extract story info
    driver.get(link)
    time.sleep(3)
    key = remove_invalid_chars(name)

    # Enter the reader through the page's "btn-read-last" button.
    # NOTE(review): the original comment called this "move to first page";
    # confirm that this button really opens the first chapter.
    driver.find_element(By.ID, "btn-read-last").click()
    time.sleep(3)

    # create folder to store data
    folder_name = create_folder(story_name=key)
    chap_id     = 1   # id of the NEXT chapter file to write
    while True:
        # Up to 5 attempts to read and store the chapter currently displayed.
        for _ in range(5):
            try:
                content      = driver.find_element(By.CLASS_NAME, "entry-content")
                chapter_text = content.find_element(By.CLASS_NAME, "text-left")

                # The title is an <h1> on some pages and an <h3> on others.
                try:
                    chapter_name = chapter_text.find_element(By.TAG_NAME, "h1").text
                except Exception:
                    chapter_name = chapter_text.find_element(By.TAG_NAME, "h3").text
                print("chapter name -> ", chapter_name)

                ps      = chapter_text.find_elements(By.TAG_NAME, "p")
                ps_text = chapter_name + "\n" + "\n".join(p.text for p in ps)
                print("length of content -> ", len(ps_text))

                save_chapter_text(folder_name, chap_id, ps_text)
                chap_id += 1
                break
            except Exception:
                # `Exception` rather than a bare except, so interrupting the
                # kernel still stops the crawl.
                print("failed to fetch content- trying again...")
                time.sleep(5)

        # Up to 3 attempts to move to the next chapter; none left ends the story.
        moved = False
        for _ in range(3):
            try:
                footer    = driver.find_element(By.ID, "manga-reading-nav-foot")
                # NOTE(review): this XPath starts with "//", so it searches the
                # whole document rather than only `footer`; left as is because
                # the page structure cannot be verified from here.
                next_page = footer.find_element(
                    By.XPATH,
                    "//div[@class='nav-links']//div[@class='nav-next ']//a[@class='btn next_page']")
                next_page.click()
                time.sleep(2)
                moved = True
                break
            except Exception:
                time.sleep(3)

        if not moved:
            break

    # `chap_id` is one past the last file written, so the number of stored
    # chapters is chap_id - 1 (the raw counter over-reported by one).
    # NOTE(review): `db` is not defined anywhere in the visible notebook, and
    # `url` is whatever an earlier cell last assigned — confirm both before running.
    db.save("story", {"key":key, "name":name, "chapters":chap_id - 1, "lastchapter":lastchap, "lastupdated":lastupdate, "storylink":link, "websource":url})
