In [None]:
import requests
from bs4 import BeautifulSoup
import re
import os
from urllib.request import urlretrieve
from contextlib import closing # 用于定义站内用户的Referer
import threading

In [None]:
'''
view-source:方法，就是看页面源码，并不管动态加载的内容
这里面没有图片链接，就说明图片是动态加载的。
使用JavaScript动态加载，无外乎两种方式：
    外部加载
    内部加载
外部加载就是在html页面中，以引用的形式，加载一个js，例如这样：
<script type="text/javascript" src="https://cuijiahua.com/call.js"></script>
这段代码的意思是，引用cuijiahua.com域名下的call.js文件。
内部加载就是Javascript脚本内容写在html内.
'''

In [None]:
def get_content(content_url, timeout=10):
    """Fetch an index page and return the chapter links it lists.

    Args:
        content_url: URL of the page that contains the
            ``<ul class="list_con_li">`` chapter list.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook forever.

    Returns:
        A list of ``(href, title)`` tuples, one per ``<a>`` inside the chapter
        list that carries both attributes. Spaces in the title are replaced
        with underscores. The list is sorted by the title with its first four
        characters ignored.

    Raises:
        requests.HTTPError: The server answered with an error status.
        ValueError: The page does not contain the expected chapter list.
    """
    req = requests.get(url=content_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    req.raise_for_status()
    req.encoding = 'utf-8'
    bs = BeautifulSoup(req.text, 'lxml')

    list_con_li = bs.find('ul', class_='list_con_li')
    if list_con_li is None:
        raise ValueError(
            "no <ul class='list_con_li'> chapter list found at %s" % content_url)

    # Anchors without href/title cannot be downloaded or named, so skip them
    # rather than letting one odd link abort the whole listing.
    results = [(chapter['href'], chapter['title'].replace(' ', '_'))
               for chapter in list_con_li.find_all('a')
               if chapter.has_attr('href') and chapter.has_attr('title')]

    # Sort on the title minus its first four characters.
    # NOTE(review): this is a plain string comparison, so numbered titles
    # order lexicographically ("10" before "2") - confirm that is intended.
    return sorted(results, key=lambda x: x[1][4:])

# Index page whose chapter list is scraped by get_content().
target = 'https://www.dmzj.com/info/yaoshenji.html'
# List of (href, title) tuples, one per chapter link found on the index page.
chapters = get_content(target)
# Report how many chapter links were collected.
print(len(chapters))

In [None]:
def _chapter_picture_urls(script_text):
    """Build the ordered image URLs described by a chapter page's script text.

    Args:
        script_text: String form of the page's first ``<script>`` element.

    Returns:
        A list of image URLs in download order, or ``None`` when the two
        directory components (a 4-digit and a 5-digit number, each delimited
        by ``|``) cannot be found in ``script_text``.
    """
    pic_numbers = re.findall(r'\d{13,14}', script_text)
    suff_matches = re.findall(r'\|(\d{5})\|', script_text)
    pref_matches = re.findall(r'\|(\d{4})\|', script_text)
    if not suff_matches or not pref_matches:
        return None

    base = ('https://images.dmzj.com/img/chapterpic/'
            + pref_matches[0] + '/' + suff_matches[0] + '/')
    # Picture ids are 13 or 14 digits long. For ordering only, a 13-digit id
    # is padded with a trailing '0' so both lengths compare consistently.
    ordered = sorted(pic_numbers, key=lambda number: number.ljust(14, '0'))
    return [base + number + '.jpg' for number in ordered]


def get_images(chapter_item, timeout=10):
    """Download every image of the given chapters into per-chapter folders.

    For each ``(url, title)`` pair the chapter page is fetched, the image URLs
    are derived from its first ``<script>`` element, and each image is saved
    as ``<title>/<index>.jpg`` where ``index`` is its position in the sorted
    order. The chapter URL is sent as ``Referer`` on every image request.

    Args:
        chapter_item: Iterable of ``(url, title)`` pairs.
        timeout: Seconds to wait on each HTTP request.

    Returns:
        ``0`` when every chapter was processed, ``1`` as soon as a chapter
        page cannot be parsed or an image request does not answer with
        HTTP 200 (remaining work is abandoned).
    """
    for url, title in chapter_item:
        req = requests.get(url=url, timeout=timeout)
        bs = BeautifulSoup(req.text, 'lxml')
        pic_urls = _chapter_picture_urls(str(bs.script))
        if pic_urls is None:
            # Report which chapter could not be parsed and signal failure via
            # the return code. Calling exit() here would take down the whole
            # notebook kernel (or silently end the worker thread).
            print(url)
            print(title)
            return 1

        download_header = {'Referer': url}
        # exist_ok avoids the check-then-create race when several threads run.
        os.makedirs(title, exist_ok=True)

        for index, pic_url in enumerate(pic_urls):
            with closing(requests.get(pic_url, headers=download_header,
                                      stream=True, timeout=timeout)) as response:
                if response.status_code != 200:
                    print('link error')
                    print(response.status_code)
                    return 1
                # Stream to disk in 1 KiB chunks; no Content-Length header is
                # required for this, so its absence must not be an error.
                with open(os.path.join(title, str(index) + '.jpg'), 'wb') as file:
                    for data in response.iter_content(chunk_size=1024):
                        file.write(data)

    return 0
            

# get_images(chapters[:1])

In [None]:
'''
————————————————
版权声明：本文为CSDN博主「行者小朱」的原创文章，遵循CC 4.0 BY-SA版权协议，转载请附上原文出处链接及本声明。
原文链接：https://blog.csdn.net/u012050154/article/details/80032072
'''
class MyThread(threading.Thread):
    """Thread that keeps the return value of the callable it runs.

    ``func`` is invoked with ``args`` as ONE positional argument; the value
    is passed through as-is and is not unpacked.
    """

    def __init__(self, func, args=()):
        super().__init__()
        self.func = func
        self.args = args

    def run(self):
        """Call ``func(args)`` and remember what it returned."""
        outcome = self.func(self.args)
        self.result = outcome

    def get_result(self):
        """Wait for the thread to finish and return the stored result.

        Returns ``None`` when no result was recorded, e.g. because ``func``
        raised inside the thread.
        """
        self.join()  # wait for the thread to finish
        return getattr(self, 'result', None)

In [None]:
%%time
# Upper bound on the number of download threads.
threads_num = 10
chapter_length = len(chapters)
# Ceiling division: floor division would leave the last
# `chapter_length % threads_num` chapters unassigned, and would give every
# thread an empty slice when there are fewer chapters than threads.
step = -(-chapter_length // threads_num)
print(step)
tasks = []
start = end = 0
# `step or 1` keeps range() valid when there are no chapters at all.
for start in range(0, chapter_length, step or 1):
    end = min(start + step, chapter_length)
    # MyThread hands its second argument to the function as a single value,
    # so the slice is passed directly rather than wrapped in a tuple.
    task = MyThread(get_images, chapters[start:end])
    task.start()
    tasks.append(task)

# get_result() joins each thread; prints 0 (ok), 1 (failed) or None (crashed).
for task in tasks:
    print(task.get_result(), end='\t')

In [None]:
# copy from https://blog.csdn.net/rankun1/article/details/81357179
import requests
from bs4 import BeautifulSoup
from lxml import etree
 
# Locate tags with BeautifulSoup's find_all.
# BeautifulSoup documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
def bs_parse_movies(html):
    """Return the stripped text of the first span under the first link of each ``div.hd``.

    Args:
        html: HTML text to parse.

    Returns:
        A list of strings, one per ``<div class="hd">`` in document order.
    """
    soup = BeautifulSoup(html, "lxml")
    # For every div with class "hd": first <a>, then its first <span>.
    return [block.a.span.text.strip()
            for block in soup.find_all('div', class_='hd')]
 
# Locate tags with a CSS selector.
# More CSS selector syntax: http://www.w3school.com.cn/cssref/css_selectors.asp
# Note: BeautifulSoup does not support every selector.
def bs_css_parse_movies(html):
    """Return the stripped text of every span matched by the CSS selector below.

    Args:
        html: HTML text to parse.

    Returns:
        A list of strings, one per matched ``<span>``.
    """
    # First <span> child of an <a> that is a direct child of a div.hd.
    spans = BeautifulSoup(html, "lxml").select('div.hd > a > span:nth-of-type(1)')
    return [span.text.strip() for span in spans]
 
# Locate tags with XPath.
# More XPath syntax: https://blog.csdn.net/gongbing798930123/article/details/78955597
def xpath_parse_movies(html):
    """Return the stripped text of every span matched by the XPath query below.

    Args:
        html: HTML text to parse.

    Returns:
        A list of strings, one per matched ``<span>``.
    """
    tree = etree.HTML(html)
    # First <span> child of an <a> child of any div whose class is exactly "hd".
    spans = tree.xpath("//div[@class='hd']/a/span[1]")
    return [span.text.strip() for span in spans]
 
def get_movies(parser=None):
    """Download the Douban top-250 page and extract the movie titles.

    Args:
        parser: Callable that takes the page HTML and returns a list of
            titles. Any of ``bs_parse_movies`` (BeautifulSoup find),
            ``bs_css_parse_movies`` (CSS selector) or ``xpath_parse_movies``
            (XPath) can be passed. Defaults to ``bs_parse_movies``.

    Returns:
        Whatever ``parser`` returns for the page, or ``None`` when the server
        does not answer with HTTP 200.
    """
    # The three locating strategies used to be listed as consecutive `return`
    # statements, of which only the first could ever run; the caller now
    # picks one explicitly.
    if parser is None:
        parser = bs_parse_movies

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Host': 'movie.douban.com'
    }

    link = 'https://movie.douban.com/top250'
    r = requests.get(link, headers=headers, timeout=10)
    print("响应状态码:", r.status_code)
    if 200 != r.status_code:
        return None

    return parser(r.text)
 
# Fetch the titles; get_movies() returns None when the request is not HTTP 200.
movies = get_movies()
print(movies)