# Online comic scraper

---

For an introduction to Selenium, see [**this guide**](https://www.scrapingbee.com/blog/selenium-python/).

## Import modules

To record the installed package versions, run `%pip freeze > requirements.txt` in a code cell.

In [1]:
# Parse JSON
import json
# File-name handling
import os
# Regular expression
import re
import string
# Timing
import time
# Access website
import urllib
import urllib.parse
import urllib.request
# File management
from pathlib import Path

# Data manipulation
import pandas as pd

# Web scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
# Module for dropdown selector
from selenium.webdriver.support.ui import Select

## Load the Edge webdriver

Since Selenium 4.6.0, the Edge WebDriver no longer needs to be downloaded manually. [**Read more about Selenium 4.6.0**](https://www.selenium.dev/blog/2022/selenium-4-6-0-released/)

In [2]:
# Start an Edge browser session; `driver` is the handle used by the cells below
driver = webdriver.Edge()

## Access the url

In [3]:
# Main URL of the comic to scrape (the title and chapter table are read from this page)
url = 'https://readcomiconline.li/Comic/Curses'

In [4]:
# Navigate the browser to the main URL
driver.get(url)

## Core Procedure

### 1 Get comic title

In [141]:
# Read the title text from the 'bigChar' element, then build an abbreviation
# from the first character of every whitespace-separated word
comic_title = driver.find_element(By.CLASS_NAME, 'bigChar').text
comic_abbrv = ''.join([word[0] for word in comic_title.split()])

In [142]:
# Display the comic title
comic_title

'A Game Of Thrones'

In [143]:
# Display the abbreviation (first character of each word in the title)
comic_abbrv

'AGOT'

### 2 Get URLs for each chapter

In [144]:
# First <table> element found on the page
# NOTE(review): presumably the chapter listing — confirm against the page markup
table_chapter = driver.find_element(By.TAG_NAME, 'table')

In [145]:
# List of one-item dicts mapping each link's text to its href, in the reverse
# of the order the links appear in the table.
# `reversed()` on its own returns a one-shot iterator that cannot be indexed
# (`list_chapter[0]`) or displayed, so it is materialised as a list here.
list_chapter = list(reversed([
    {element.text: element.get_property('href')}
    for element in table_chapter.find_elements(By.TAG_NAME, 'a')
]))

In [146]:
# Display the chapter name -> URL mappings
list_chapter

[{'A Game Of Thrones Issue #24': 'https://readcomiconline.li/Comic/A-Game-Of-Thrones/Issue-24?id=24952'},
 {'A Game Of Thrones Issue #23': 'https://readcomiconline.li/Comic/A-Game-Of-Thrones/Issue-23?id=24951'},
 {'A Game Of Thrones Issue #22': 'https://readcomiconline.li/Comic/A-Game-Of-Thrones/Issue-22?id=24950'},
 {'A Game Of Thrones Issue #21': 'https://readcomiconline.li/Comic/A-Game-Of-Thrones/Issue-21?id=24949'},
 {'A Game Of Thrones Issue #20': 'https://readcomiconline.li/Comic/A-Game-Of-Thrones/Issue-20?id=24948'},
 {'A Game Of Thrones Issue #19': 'https://readcomiconline.li/Comic/A-Game-Of-Thrones/Issue-19?id=24947'},
 {'A Game Of Thrones Issue #18': 'https://readcomiconline.li/Comic/A-Game-Of-Thrones/Issue-18?id=24946'},
 {'A Game Of Thrones Issue #17': 'https://readcomiconline.li/Comic/A-Game-Of-Thrones/Issue-17?id=24945'},
 {'A Game Of Thrones Issue #16': 'https://readcomiconline.li/Comic/A-Game-Of-Thrones/Issue-16?id=24944'},
 {'A Game Of Thrones Issue #15': 'https://read

### 3 Open each URL

In [147]:
# Open the page of the first entry in the chapter list
driver.get(next(iter(list_chapter[0].values())))

### 4 Switch read mode to full page

In [148]:
# Wrap the element with id 'selectReadType' in Selenium's Select helper so
# its options can be listed and chosen
dropdown_obj = Select(driver.find_element(By.ID, 'selectReadType'))

In [149]:
# Collect the visible label of every option in the dropdown
options = dropdown_obj.options
dropdown_values = [option.text for option in options]

In [150]:
# Display the option labels collected from the dropdown
dropdown_values

['One page', 'All pages']

In [151]:
# Choose the last option in the dropdown by its visible label
dropdown_obj.select_by_visible_text(dropdown_values[-1])

### 5 Scroll down page

In [152]:
# Total scrollable height of the document (scrollHeight), as reported by the browser
scroll_height = driver.execute_script('return document.documentElement.scrollHeight;')

In [153]:
# Display the page height
scroll_height

48719

In [154]:
# Walk down the page in 1000-unit steps, pausing one second after each step
# NOTE(review): presumably the pause lets lazily loaded images appear — confirm
scroll_height = driver.execute_script('return document.documentElement.scrollHeight;')
for offset in range(0, scroll_height, 1000):
    driver.execute_script(f'window.scrollTo(0, {offset});')
    time.sleep(1)

### 6 Download the images

In [155]:
# Element with id 'divImage'; the <img> tags are looked up inside it
image_obj = driver.find_element(By.ID, 'divImage')

In [158]:
# One single-entry dict per <img>: generated label (abbreviation + running
# index) mapped to the image's `src` attribute
list_images = [
    {f'{comic_abbrv}{position}': image.get_attribute('src')}
    for position, image in enumerate(image_obj.find_elements(By.TAG_NAME, 'img'))
]

In [159]:
# Display the image label -> URL mappings
list_images

[{'AGOT0': 'https://2.bp.blogspot.com/-Yk0pkOjO8is/Vt1BsyH56FI/AAAAAAAALHA/0nYbsDiWYZE/s1600-Ic42/RCO001.jpg'},
 {'AGOT1': 'https://2.bp.blogspot.com/-Fk6A275d-gU/Vt1Bs2JcdNI/AAAAAAAALHA/VsnW5phktnU/s1600-Ic42/RCO002.jpg'},
 {'AGOT2': 'https://2.bp.blogspot.com/-S9JJ1KXIKz4/Vt1BswQ6a3I/AAAAAAAALHA/WJRM3y1WwjE/s1600-Ic42/RCO003.jpg'},
 {'AGOT3': 'https://2.bp.blogspot.com/-qFv7fQtKB5A/Vt1BtVo2xUI/AAAAAAAALHA/lTWdCMjd4AM/s1600-Ic42/RCO004.jpg'},
 {'AGOT4': 'https://2.bp.blogspot.com/-5LDnXJbbCE0/Vt1BtTVOBMI/AAAAAAAALHA/gfZ1odaQkkk/s1600-Ic42/RCO005.jpg'},
 {'AGOT5': 'https://2.bp.blogspot.com/-gS-a9OcX23s/Vt1BtrmHO6I/AAAAAAAALHA/G6J-cOLBolM/s1600-Ic42/RCO006_w.jpg'},
 {'AGOT6': 'https://2.bp.blogspot.com/-yicLMM2yR3M/Vt1BtkHmFzI/AAAAAAAALHA/ifPPEWMVOBo/s1600-Ic42/RCO007.jpg'},
 {'AGOT7': 'https://2.bp.blogspot.com/-khMNRG1BWu8/Vt1Bt4Jx76I/AAAAAAAALHA/d1xh5kzV4QQ/s1600-Ic42/RCO008.jpg'},
 {'AGOT8': 'https://2.bp.blogspot.com/-OH3khkzMxxk/Vt1BtzLTvVI/AAAAAAAALHA/KYtHy3rh3wA/s1600-Ic42/RCO0

In [209]:
# Label and source URL of the first scraped image (each entry is a one-item dict)
image_label, image_url = next(iter(list_images[0].items()))

# File name: label plus the extension taken from the URL *path* only, so a
# query string or fragment (e.g. '...jpg?w=1600') never ends up in the name
image_name = image_label + os.path.splitext(urllib.parse.urlsplit(image_url).path)[1]

In [213]:
# Fetch the image and save it under the generated file name
urllib.request.urlretrieve(image_url, filename=image_name)

('AGOT0.jpg', <http.client.HTTPMessage at 0x24d6dad2250>)