import statements

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException
import requests
import os
from PIL import Image
import re
import shutil
from tenacity import retry, stop_after_attempt, wait_fixed, RetryError
from datetime import datetime
import logging
import sys
import time

image processing functions

In [None]:
def download_image(img_url, i):
    """Download one page image and return it as an opened PIL image.

    The image is written to ``image_<i>.jpg`` inside the module-level
    ``images_folder`` (assigned by the issue-processing loop) and then opened
    with PIL. If the download fails, or the server answers with an HTTP error
    status, the failure is logged and the local placeholder image
    ``no-image-placeholder.jpg`` is returned instead.

    Args:
        img_url: URL of the image to download.
        i: Page number; used for the file name and in the log message.

    Returns:
        The opened PIL image for the downloaded page, or the placeholder.

    Raises:
        OSError: If the image file cannot be written or opened, or if the
            placeholder image is missing.
    """
    try:
        response = get_image_from_url(img_url)
        # An error page (404, 503, ...) is not an image; treat it as a failure
        # instead of saving its body under a .jpg name.
        response.raise_for_status()
    except Exception:
        # Covers tenacity's RetryError as well as requests' own errors.
        logging.error(f"Unsuccessful download of image {i}: {img_url}")
        return Image.open("no-image-placeholder.jpg")

    image_path = os.path.join(images_folder, f"image_{i}.jpg")
    with open(image_path, "wb") as image:
        image.write(response.content)
    return Image.open(image_path)

def get_all_image_files(img_srcs):
    """Download every image URL in order and return the opened images.

    Pages are numbered from 1 in the order the URLs are given; the number is
    passed on to ``download_image``.

    Args:
        img_srcs: Iterable of image URLs.

    Returns:
        A list with one ``download_image`` result per URL, in input order.
    """
    return [
        download_image(src, page_number)
        for page_number, src in enumerate(img_srcs, start=1)
    ]

captcha related functions

In [None]:
def is_captcha_page(driver):
    """Report whether the driver's current page contains the CAPTCHA form.

    The page counts as a CAPTCHA page when it has an element with the id
    ``formVerify``. The outcome is printed, and a detected CAPTCHA is also
    written to the log together with the current URL.

    Args:
        driver: Selenium webdriver whose current page is inspected.

    Returns:
        True if the CAPTCHA form is present, False if it is not.

    Raises:
        selenium.common.exceptions.WebDriverException: For driver failures
            other than the element being absent. These are no longer
            reported as "no CAPTCHA".
    """
    try:
        driver.find_element(By.ID, "formVerify")
    except NoSuchElementException:
        print("No CAPTCHA on this page")
        return False

    print("CAPTCHA found on this page")
    print(f"CAPTCHA URL: {driver.current_url}")
    logging.warning("CAPTCHA found on this page")
    logging.warning(f"CAPTCHA URL: {driver.current_url}")
    return True

@retry(stop=stop_after_attempt(5))
def solve_captcha(captcha_url):
    """Open the CAPTCHA page in a separate browser and wait for it to be solved.

    A new driver is started with ``initialize_driver(False)``, the CAPTCHA URL
    is loaded, and the function sleeps for 60 seconds before checking the page
    again. The separate driver is always closed afterwards.

    The ``retry`` decorator re-runs the function when it raises, for at most
    five attempts.

    Args:
        captcha_url: URL of the page showing the CAPTCHA.

    Raises:
        Exception: If the CAPTCHA form is still present after the wait (this
            is what triggers the next retry attempt).
    """
    browser = initialize_driver(False)
    print("Starting captcha driver")
    try:
        browser.get(captcha_url)
        logging.info("60 seconds to complete the CAPTCHA")
        print("60 seconds to complete CAPTCHA")
        time.sleep(60)

        still_blocked = is_captcha_page(browser)
        if not still_blocked:
            print("No CAPTCHA or it's solved")
            logging.info("No CAPTCHA or it's solved")
            return

        logging.warning("CAPTCHA not complete")
        raise Exception("CAPTCHA not complete")
    finally:
        print("Closing captcha driver")
        browser.quit()
driver-related functions

In [None]:
def initialize_driver(headless=True):
    """Create and return a Selenium Firefox webdriver.

    Args:
        headless: When truthy (the default), Firefox is started with the
            ``-headless`` argument; otherwise no extra argument is added.

    Returns:
        The new ``webdriver.Firefox`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    options = webdriver.FirefoxOptions()
    if headless:
        options.add_argument("-headless")
    return webdriver.Firefox(options=options)

def get_series_title(driver):
    """Read the series title from the current page.

    The title is the text of the element with class ``bigChar``, passed
    through ``make_valid_filename``.

    Args:
        driver: Selenium webdriver positioned on the series page.

    Returns:
        The cleaned title string, or None (after logging an error) when the
        title element is not on the page.
    """
    try:
        heading = driver.find_element(By.CLASS_NAME, "bigChar")
    except NoSuchElementException:
        logging.error("Series title not found!")
        return None

    return make_valid_filename(heading.text)

def get_anchor_elements(driver):
    """Collect the ``<a>`` elements of the issue listing on the current page.

    The links are looked up inside the element with class ``listing``.

    Args:
        driver: Selenium webdriver positioned on the series page.

    Returns:
        The list of anchor elements found inside the listing, or None (after
        logging an error) when the listing element is not on the page.
    """
    try:
        listing = driver.find_element(By.CLASS_NAME, "listing")
    except NoSuchElementException:
        logging.error("No comic book issues (volumes) found!")
        return None

    return listing.find_elements(By.TAG_NAME, "a")
    
def get_image_elements(driver):
    """Collect the ``<img>`` elements of an issue page.

    The option with value ``"1"`` is selected in the ``selectReadType``
    dropdown (presumably the mode that shows all pages at once; verify
    against the site), and the images are then read from the ``divImage``
    container.

    Args:
        driver: Selenium webdriver positioned on an issue page.

    Returns:
        The list of image elements inside ``divImage``, or None (after
        logging an error) when the ``selectReadType`` dropdown is missing.

    Raises:
        NoSuchElementException: If the dropdown exists but ``divImage`` does
            not; only the dropdown lookup is guarded.
    """
    try:
        read_type_dropdown = driver.find_element(By.ID, "selectReadType")
    except NoSuchElementException:
        logging.error("Cannot access comic book issue page.")
        return None

    Select(read_type_dropdown).select_by_value("1")
    image_container = driver.find_element(By.ID, "divImage")
    return image_container.find_elements(By.TAG_NAME, "img")

misc functions

In [None]:
def make_valid_filename(filename):
    """Strip characters that are reserved in file names.

    Removes every occurrence of ``< > : " / \\ | ? *``. The backslash is
    included explicitly: inside a regex character class ``\\/`` only matches
    a forward slash, so a pattern written that way leaves backslashes in
    place.

    Args:
        filename: The raw name, e.g. a series or issue title.

    Returns:
        ``filename`` with all reserved characters removed. Other characters,
        including spaces, are left untouched.
    """
    pattern = r'[<>:"/\\|?*]'
    return re.sub(pattern, '', filename)

@retry(stop=stop_after_attempt(10), wait=wait_fixed(1))
def get_image_from_url(img_url, timeout=30):
    """Fetch ``img_url`` with an HTTP GET request.

    The ``retry`` decorator re-runs the request when it raises, waiting one
    second between attempts, for at most ten attempts.

    Args:
        img_url: URL to fetch.
        timeout: Seconds to wait for the server before giving up on one
            attempt. Without a timeout a stalled connection would block
            forever and the retry policy would never get a chance to run.

    Returns:
        The ``requests`` response object. The HTTP status is not checked
        here.
    """
    return requests.get(img_url, timeout=timeout)

Initialize logging

In [None]:
# Start-up timestamp; used at the end of the run to name the archived log file.
current_datetime = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
logger = logging.getLogger(__name__)

# Everything from DEBUG upwards goes to logfile.log in the working directory.
logging.basicConfig(
    filename="logfile.log",
    encoding='utf-8',
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Third-party libraries only get to log warnings and above.
for _noisy_name in (
    'selenium',
    'selenium.webdriver',
    'selenium.webdriver.common',
    'requests',
    'urllib3',
    'PIL',
):
    logging.getLogger(_noisy_name).setLevel(logging.WARNING)

URL

In [None]:
# Series to download: paste the URL of a comic series from
# https://readcomiconline.li/
url = "https://readcomiconline.li/Comic/Batman-Off-World"
# A trailing slash, if pasted along, is dropped.
url = url.rstrip("/")
# Alternative URLs kept for manual testing (they point at the site's
# "AreYouHuman" page):
# url = "https://readcomiconline.li/Special/AreYouHuman?reUrl=%2fComic%2fBatman-Off-World"
# url = "https://readcomiconline.li/Special/AreYouHuman?reUrl=%2fComic%2fBatman-Off-World%2fIssue-1%3fid%3d222020"
logging.info(f"URL: {url}")

Series page

In [None]:
# Start the main browser (headless by default) and open the series page.
# This driver is reused by the statements below and must be closed with
# driver.quit() once the issue list has been collected.
driver = initialize_driver()
driver.get(url)
# Debug output: shows which driver instance the main flow is using.
print(f"Outside driver: {driver}")

def handles_captcha(driver):
    """Deal with a CAPTCHA on the driver's current page, if there is one.

    When the page is a CAPTCHA page, its URL is handed to ``solve_captcha``.
    If that gives up with a ``RetryError``, the failure is logged and the
    program exits.

    Args:
        driver: Selenium webdriver whose current page is checked.

    Raises:
        SystemExit: When ``solve_captcha`` runs out of retry attempts.
    """
    if not is_captcha_page(driver):
        print("no more captcha or ISNT ONE")
        return

    captcha_url = driver.current_url
    try:
        solve_captcha(captcha_url)
    except RetryError:
        print("captcha solve timeout")
        logging.error("CAPTCHA timeout! Time limit exceeded.")
        logging.critical("Program forcefully exited! Rerun the program again.")
        sys.exit()
# Read the series title and the list of issues from the series page.
# The try/finally guarantees the browser is closed even when a step fails or
# the program exits early.
try:
    # Get the series title
    handles_captcha(driver)
    # Reload the series page: the driver may have been rerouted to the CAPTCHA page
    driver.get(url)
    print(f"Main driver URL: {driver.current_url}")

    comic_series = get_series_title(driver)
    if comic_series is None:
        # Without a title there is no folder name to work with; stop here
        # instead of continuing with an undefined or "None" series name.
        logging.critical("Series title not found; cannot continue.")
        sys.exit("Series title not found; cannot continue.")
    logging.info(f"Comic Series: '{comic_series}'")
    print(comic_series)

    # Get all issues URLs in the series
    handles_captcha(driver)
    anchor_elements = get_anchor_elements(driver)
    if anchor_elements is None:
        logging.critical("No issues found on the series page; cannot continue.")
        sys.exit("No issues found on the series page; cannot continue.")

    # One single-entry dict per issue: {link text: link URL}.
    issues = [{elem.text: elem.get_attribute("href")} for elem in anchor_elements]
    # Process the issues in the opposite order to how the page lists them
    # (presumably the page lists the newest issue first; verify).
    issues.reverse()
    logging.info(f"Found {len(issues)} issues total.")
finally:
    driver.quit()

Create local series folder

In [None]:
# The series folder is created in the current working directory and named
# after the series; both names are kept because later cells use them.
comic_series_folder = f"{comic_series}"
comic_series_path = comic_series_folder

if os.path.isdir(comic_series_path):
    logging.warning(f"{comic_series_folder} already exists.")
else:
    # Any other failure (permissions, invalid name, ...) is allowed to raise
    # instead of being reported as "already exists".
    os.makedirs(comic_series_path)
    print(f"Created {comic_series_folder}.")

Process each series issue

In [None]:
# Download every issue of the series and save it as "<series>/<issue>.pdf".
# Each entry of `issues` is a single-item dict {issue title: issue URL}.
for issue in issues:
    # `issue_url` (not `url`) so the series URL defined earlier is not overwritten.
    for title, issue_url in issue.items():
        title = make_valid_filename(title)
        pdf_path = os.path.join(comic_series_folder, f"{title}.pdf")

        # If the issue.pdf exists, skip it
        if os.path.exists(pdf_path):
            logging.info(f"'{title}.pdf' already exists.")
            continue

        # Temporary working folder for this issue. `images_folder` must be a
        # module-level name because download_image() reads it as a global.
        issue_folder = os.path.join(comic_series_path, title)
        images_folder = os.path.join(issue_folder, "images")
        if os.path.isdir(images_folder):
            logging.warning(f"'{issue_folder}' already exists.")
        else:
            os.makedirs(images_folder)
            logging.info(f"Created '{issue_folder}'.")

        print("Trying", title)
        logging.info(f"Working on '{title}'")

        # Collect the image URLs; start from an empty list so a failed scrape
        # can never reuse the URLs of the previous issue.
        img_srcs = []
        driver = initialize_driver()
        try:
            driver.get(issue_url)
            handles_captcha(driver)
            # Reload the issue page in case the driver was rerouted to the CAPTCHA page
            driver.get(issue_url)
            img_elements = get_image_elements(driver)
            if img_elements is not None:
                img_srcs = [elem.get_attribute("src") for elem in img_elements]
                print("Images found:", len(img_srcs))
        except Exception as e:
            # One broken issue should not abort the whole series.
            print(e)
            logging.error(f"Could not read images for '{title}': {e}")
        finally:
            driver.quit()

        if not img_srcs:
            logging.error(f"No images found for '{title}'; skipping.")
            continue

        image_files = get_all_image_files(img_srcs)
        try:
            image_files[0].save(pdf_path, save_all=True, append_images=image_files[1:])
        finally:
            # Release the image files so the folder can be deleted afterwards.
            for image_file in image_files:
                image_file.close()
        logging.info(f"Created '{title}.pdf'")

        try:
            shutil.rmtree(issue_folder)
            print(f"Folder '{issue_folder}' successfully deleted.")
        except FileNotFoundError:
            print(f"Folder '{issue_folder}' already deleted.")
        except OSError as e:
            # The folder is still there (e.g. a file is locked); say so
            # rather than claiming it was already deleted.
            print(f"Folder '{issue_folder}' could not be deleted: {e}")
            logging.warning(f"Could not delete '{issue_folder}': {e}")

        print("Finished", title)

Gracefully close/handle logging

In [None]:
# Write the closing log entry, then shut logging down before the log file is
# moved into the series folder under the start-up timestamp.
logging.info(f"Completed Series: '{comic_series}'\n")
logging.shutdown()
archived_log = f"{comic_series_path}/{current_datetime}.log"
os.rename("logfile.log", archived_log)