In [40]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import NoSuchElementException
import requests
import os
from PIL import Image
import re
import shutil
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_result, RetryError
import sys
from datetime import datetime
import logging

In [41]:
# TODO: possible optimizations
# - download several issues in parallel (multithreading)
# - fetch the images of each issue with async functions

In [42]:
# Timestamp taken when this cell runs, formatted so it is safe to use in a file name.
current_datetime = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
logger = logging.getLogger(__name__)

# Root logger: everything from DEBUG upwards goes to "logfile.log".
logging.basicConfig(
    filename="logfile.log",
    encoding='utf-8',
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Raise the threshold of the third-party loggers so that only their
# warnings and errors end up in the log file.
for noisy_logger in (
    'selenium',
    'selenium.webdriver',
    'selenium.webdriver.common',
    'requests',
    'urllib3',
    'PIL',
):
    logging.getLogger(noisy_logger).setLevel(logging.WARNING)


In [43]:
# Series to download: paste the URL of a comic series page from
# https://readcomiconline.li/ below. Any trailing "/" is stripped.
url = "https://readcomiconline.li/Comic/Batman-Off-World"
url = url.rstrip("/")
parsed_url = url.split("/")
logging.info(f"URL: {url}")

# Progress notes:
# - DONE: take the series name from the page itself instead of the URL
#         (covers the case where the series page first shows a captcha)
# - DONE: detect whether the loaded page is a captcha
# - UNVERIFIED: logging during retries; can only be confirmed once a captcha is actually hit
# - DONE: name the log file "<datetime>.log"
# - DONE: move the log file into the series folder


In [44]:
def make_valid_filename(filename):
    """Strip characters that are not allowed in file names.

    Removes every occurrence of ``< > : " / \\ | ? *`` from ``filename``.
    The backslash is included because it is both an invalid file-name
    character and a path separator on Windows; leaving it in would make
    ``os.path.join`` treat part of the title as a sub-directory there.

    Args:
        filename: Raw text (e.g. a series or issue title).

    Returns:
        The text with all of the characters above removed. May be an empty
        string if the input consisted only of such characters.
    """
    # Inside a raw string, "\\" is the regex escape for one literal backslash.
    pattern = r'[<>:"/\\|?*]'
    return re.sub(pattern, '', filename)

In [45]:
def is_captcha_page(driver):
    """Tell whether the page currently loaded in ``driver`` is a captcha.

    A page is treated as a captcha when it contains an element whose id is
    ``formVerify``.

    Args:
        driver: Selenium WebDriver whose current page is inspected.

    Returns:
        True if the captcha form is present (the page URL is printed and
        logged as a warning), False otherwise.
    """
    try:
        driver.find_element(By.ID, "formVerify")
    except NoSuchElementException:
        # Guard clause: no captcha form means a regular page.
        print("No captcha on this page")
        return False

    print(driver.current_url)
    logging.warning("CAPTCHA found on this page")
    logging.warning(f"CAPTCHA URL: {driver.current_url}")
    return True

In [46]:
@retry(
    wait=wait_fixed(15),
    stop=stop_after_attempt(20),
    retry=retry_if_result(lambda _result: is_captcha_page(driver)),
)
def get_series_title(driver):
    """Read the series name from the series page.

    The title is the text of the element with class ``bigChar``, passed
    through ``make_valid_filename``.

    Retry behaviour: after each call the decorator checks ``is_captcha_page``
    and, while a captcha is shown, calls the function again (up to 20
    attempts, 15 seconds apart). Note that the predicate reads the
    module-level ``driver`` variable, not this function's argument.

    Args:
        driver: Selenium WebDriver with the series page loaded.

    Returns:
        The sanitized series title, or None when the title element is not
        found (an error is logged in that case).
    """
    try:
        heading = driver.find_element(By.CLASS_NAME, "bigChar")
    except NoSuchElementException:
        logging.error("Series title not found!")
        return None
    return make_valid_filename(heading.text)

In [47]:
@retry(
    wait=wait_fixed(15),
    stop=stop_after_attempt(20),
    retry=retry_if_result(lambda _result: is_captcha_page(driver)),
)
def get_anchor_elements(driver):
    """Collect the ``<a>`` elements (one per comic book issue) of a series page.

    The links are looked up inside the element with class ``listing``.

    Retry behaviour: after each call the decorator checks ``is_captcha_page``
    and, while a captcha is shown, calls the function again (up to 20
    attempts, 15 seconds apart). Note that the predicate reads the
    module-level ``driver`` variable, not this function's argument.

    Args:
        driver: Selenium WebDriver with the series page loaded.

    Returns:
        A list of anchor WebElements, or None when the listing element is
        not found (an error is logged in that case).
    """
    try:
        listing = driver.find_element(By.CLASS_NAME, "listing")
    except NoSuchElementException:
        logging.error("No comic book issues (volumes) found!")
        return None
    return listing.find_elements(By.TAG_NAME, "a")

In [48]:
# Open the series page in a headless Firefox session, then read the series
# title and the list of issue links from it.
options = webdriver.FirefoxOptions()
options.add_argument("-headless")
# NOTE: this must stay a module-level name -- the captcha retry predicates of
# the getter functions look up the global `driver`.
driver = webdriver.Firefox(options=options)

# try/finally guarantees the browser process is closed on every path,
# including the sys.exit() calls below and any unexpected exception.
try:
    driver.get(url)

    # Get the series title
    try:
        comic_series = get_series_title(driver)
    except RetryError:
        logging.error("CAPTCHA timeout! Time limit exceeded.")
        logging.critical("Program forcefully exited! Rerun the program again.")
        sys.exit()
    if not comic_series:
        # get_series_title() returns None when the title element is missing.
        # Without a title there is no sensible name for the output folder.
        logging.critical("Series title unavailable. Program forcefully exited! Rerun the program again.")
        sys.exit()
    logging.info(f"Comic Series: '{comic_series}'")

    # Get all issues URLs in the series
    try:
        anchor_elements = get_anchor_elements(driver)
    except RetryError:
        logging.error("CAPTCHA timeout! Time limit exceeded.")
        logging.critical("Program forcefully exited! Rerun the program again.")
        sys.exit()

    # get_anchor_elements() returns None when the listing is missing; treat
    # that as "no issues". Each entry is a one-item dict {link text: href},
    # which is the shape the download loop iterates over.
    issues = [{elem.text: elem.get_attribute("href")} for elem in (anchor_elements or [])]
    # Process the links in the opposite order to how the page lists them
    # (presumably the page shows the newest issue first -- confirm).
    issues.reverse()
    logging.info(f"Found {len(issues)} issues total.")
finally:
    driver.quit()

No captcha on this page
No captcha on this page


In [49]:
# Create (or reuse) the output folder named after the series.
comic_series_folder = f"{comic_series}"
comic_series_path = comic_series_folder
try:
    os.makedirs(comic_series_folder)
except FileExistsError:
    # Only "already exists" is an expected condition. Any other OSError
    # (permissions, invalid name, ...) now propagates instead of being
    # mislabelled as an existing folder.
    logging.warning(f"{comic_series_folder} already exists.")
else:
    print(f"Created {comic_series_folder}.")

In [50]:
@retry(
    stop=stop_after_attempt(10),
    # Short pause so the retries are not fired back-to-back at the server.
    wait=wait_fixed(1),
)
def get_image_from_url(img_url, timeout=30):
    """Download one image, retrying on failure (up to 10 attempts).

    Args:
        img_url: URL of the image to fetch.
        timeout: Seconds to wait for the server before the attempt is
            considered failed. Without a timeout a stalled connection would
            block forever and the retry logic would never run.

    Returns:
        The ``requests.Response`` of the first successful (non-error) attempt.

    Raises:
        tenacity.RetryError: when all 10 attempts failed (connection error,
            timeout, or an HTTP 4xx/5xx status).
    """
    response = requests.get(img_url, timeout=timeout)
    # Turn HTTP error statuses into exceptions so they are retried instead of
    # the error page being handed back as if it were image data.
    response.raise_for_status()
    return response

In [51]:
@retry(
    retry=retry_if_result(lambda x: is_captcha_page(driver)),
    stop=stop_after_attempt(20),
    wait=wait_fixed(15),
)
def get_image_elements(driver):
    """Collect all ``<img>`` elements of a comic book issue page.

    Selects the option with value "1" in the ``selectReadType`` drop-down
    (presumably the mode that shows every page at once -- confirm) and then
    returns the images found inside the ``divImage`` element.

    Retry behaviour: after each call that returns normally the decorator
    checks ``is_captcha_page`` and, while a captcha is shown, calls the
    function again (up to 20 attempts, 15 seconds apart). The predicate reads
    the module-level ``driver`` variable, not this function's argument.

    Args:
        driver: Selenium WebDriver with the issue page loaded.

    Returns:
        A list of image WebElements. The list is empty when the
        ``selectReadType`` element is not found (an error is logged).

    Raises:
        NoSuchElementException: if ``divImage`` is missing although the
            drop-down was found.
    """
    try:
        select_element_page = driver.find_element(By.ID, "selectReadType")
    except NoSuchElementException:
        logging.error("Cannot access comic book issue page.")
        # Return normally instead of falling through to an unbound variable:
        # retry_if_result only evaluates the captcha check for calls that
        # return, so raising here would bypass the captcha wait entirely.
        return []

    Select(select_element_page).select_by_value("1")
    div_image = driver.find_element(By.ID, "divImage")
    return div_image.find_elements(By.TAG_NAME, "img")

In [52]:
# Download every issue of the series and bundle its pages into
# "<series folder>/<issue title>.pdf". Issues whose PDF already exists are
# skipped, so this cell can be re-run to resume an interrupted download.
for issue in issues:
    # Each entry is a one-item dict {issue title: issue URL}. The URL is bound
    # to `issue_url` so the series-level `url` variable is not overwritten.
    for title, issue_url in issue.items():
        title = make_valid_filename(title)
        pdf_path = f"{comic_series_folder}/{title}.pdf"

        # If the issue.pdf exists, skip it
        if os.path.exists(pdf_path):
            logging.info(f"'{title}.pdf' already exists.")
            continue

        # Temporary working folder for the downloaded page images.
        issue_folder = os.path.join(comic_series_path, title)
        images_folder = os.path.join(issue_folder, "images")
        try:
            os.makedirs(images_folder)
            logging.info(f"Created '{issue_folder}'.")
        except FileExistsError:
            # Other OSErrors (e.g. permissions) are real failures and propagate.
            logging.warning(f"'{issue_folder}' already exists.")

        print("Trying", title)
        logging.info(f"Working on '{title}'")

        options = webdriver.FirefoxOptions()
        options.add_argument("-headless")
        # Must stay a module-level name: the captcha retry predicate of
        # get_image_elements looks up the global `driver`.
        driver = webdriver.Firefox(options=options)
        # try/finally closes the browser on every path, including sys.exit().
        try:
            driver.get(issue_url)
            try:
                img_elements = get_image_elements(driver)
            except RetryError:
                logging.error("CAPTCHA timeout! Time limit exceeded.")
                logging.critical("Program forcefully exited! Rerun the program again.")
                sys.exit()
            # The src attributes must be read while the browser is still open.
            img_srcs = [elem.get_attribute("src") for elem in (img_elements or [])]
            print("Images found:", len(img_srcs))
        finally:
            driver.quit()

        image_files = []
        successful_img_downloads = 0
        for i, img_url in enumerate(img_srcs, start=1):
            image_path = os.path.join(images_folder, f"image_{i}.jpg")
            try:
                response = get_image_from_url(img_url)
                with open(image_path, "wb") as image:
                    image.write(response.content)
                # Opening inside the try means undecodable data is handled like
                # a failed download rather than leaving a stale image behind.
                pil_image = Image.open(image_path)
            except Exception:
                # Best effort: one bad page must not abort the whole issue.
                # `Exception` (not a bare except) lets Ctrl-C / sys.exit through.
                logging.error(f"{title}: image {i} of {len(img_srcs)}")
                logging.error(f"Unsuccessful download of image: {img_url}", exc_info=True)
                try:
                    pil_image = Image.open("no-image-placeholder.jpg")
                except OSError:
                    logging.error("Placeholder 'no-image-placeholder.jpg' could not be opened; page left out.")
                    continue
            else:
                successful_img_downloads += 1
            image_files.append(pil_image)
        logging.info(f"'{title}': {successful_img_downloads}/{len(img_srcs)} images downloaded")

        if image_files:
            try:
                # First image becomes page 1; the rest are appended as further pages.
                image_files[0].save(pdf_path, save_all=True, append_images=image_files[1:])
            finally:
                # Release the file handles so the working folder can be removed.
                for pil_image in image_files:
                    pil_image.close()
            logging.info(f"Created '{title}.pdf'")
        else:
            # Nothing to put into a PDF (no images found or none usable).
            logging.error(f"'{title}': no pages available, PDF not created.")

        try:
            shutil.rmtree(issue_folder)
            print(f"Folder '{issue_folder}' successfully deleted.")
        except FileNotFoundError:
            print(f"Folder '{issue_folder}' already deleted.")
        except OSError as exc:
            # A real deletion failure (e.g. file in use) -- report it as such.
            logging.warning(f"Could not delete '{issue_folder}': {exc}")

        print("Finished", title)

logging.info(f"Completed Series: '{comic_series}'\n")
# Flush and close the log handlers before moving the log file into the series folder.
logging.shutdown()
os.rename("logfile.log", os.path.join(comic_series_path, current_datetime + ".log"))