In [3]:
pip install requests -i https://pypi.tuna.tsinghua.edu.cn/simple

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Note: you may need to restart the kernel to use updated packages.


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Browser-like User-Agent header passed to requests.get() by get_movie_info.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

def get_movie_info(url):
    """Download a page and return its text, or None if the request fails.

    The GET request is sent with the module-level ``headers`` and a
    10-second timeout. HTTP error statuses are turned into exceptions via
    ``raise_for_status()``. Any ``requests.exceptions.RequestException`` is
    reported on stdout and results in ``None`` being returned.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as text, or None when the request failed.
    """
    try:
        resp = requests.get(url, timeout=10, headers=headers)
        resp.raise_for_status()
        # Decode using the charset detected from the body content.
        resp.encoding = resp.apparent_encoding
        page_text = resp.text
    except requests.exceptions.RequestException as err:
        print(f"请求出错: {err}")
        page_text = None
    return page_text

def parse_movie_data(html_content, limit=10):
    """Extract rank, title, rating and one-line quote from a movie listing page.

    Args:
        html_content: HTML text of the listing page. Falsy values (None or
            an empty string) produce an empty list without parsing.
        limit: Maximum number of ``div.item`` entries to inspect, in page
            order. Defaults to 10; pass None to inspect every entry found.

    Returns:
        A list of dicts with the keys '排名', '片名', '评分' and '评语', all
        holding stripped strings. Entries that lack a rank, title or rating
        node are skipped, so the list may be shorter than ``limit``.
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    movie_list = []

    for item in soup.select('div.item')[:limit]:
        rank_node = item.select_one('div.pic em')
        title_node = item.select_one('span.title')
        rating_node = item.select_one('span.rating_num')
        if rank_node is None or title_node is None or rating_node is None:
            # Incomplete entry: skip it explicitly instead of relying on an
            # AttributeError raised by `.text` on None.
            continue

        # NOTE(review): the recorded run printed the placeholder quote for
        # every movie, which suggests 'span.inq' no longer matches the page.
        # 'p.quote span' is a presumed fallback for the current markup --
        # confirm against the live HTML.
        quote_node = item.select_one('span.inq') or item.select_one('p.quote span')
        quote_text = quote_node.text.strip() if quote_node is not None else ""

        movie_list.append({
            '排名': rank_node.text.strip(),
            '片名': title_node.text.strip(),
            '评分': rating_node.text.strip(),
            # An empty quote is treated the same as a missing one.
            '评语': quote_text or "暂无评价",
        })

    return movie_list

def save_to_excel(data, filename='豆瓣电影Top10.xlsx'):
    """Write the given records to an Excel workbook and report the path.

    Args:
        data: Records accepted by ``pd.DataFrame`` (the caller passes a list
            of dicts).
        filename: Destination workbook path.
    """
    # Build the frame and write it in one chain; the index column is omitted.
    pd.DataFrame(data).to_excel(filename, engine='openpyxl', index=False)
    print(f"数据已保存到 {filename}")

if __name__ == "__main__":
    # Entry point: fetch the listing page, parse it, print each movie and
    # save the result to Excel.
    print("开始爬取豆瓣电影排行榜数据...")
    url = 'https://movie.douban.com/top250'

    html_content = get_movie_info(url)
    movie_data = parse_movie_data(html_content)

    if not movie_data:
        # Either the request failed or nothing could be parsed.
        print("未能获取到电影数据")
    else:
        print(f"成功获取 {len(movie_data)} 条电影数据")
        for movie in movie_data:
            print(f"{movie['排名']}. {movie['片名']} - 评分: {movie['评分']} - 评语: {movie['评语']}")

        save_to_excel(movie_data)

    print("爬取完成")

开始爬取豆瓣电影排行榜数据...
成功获取 10 条电影数据
1. 肖申克的救赎 - 评分: 9.7 - 评语: 暂无评价
2. 霸王别姬 - 评分: 9.6 - 评语: 暂无评价
3. 泰坦尼克号 - 评分: 9.5 - 评语: 暂无评价
4. 阿甘正传 - 评分: 9.5 - 评语: 暂无评价
5. 千与千寻 - 评分: 9.4 - 评语: 暂无评价
6. 美丽人生 - 评分: 9.5 - 评语: 暂无评价
7. 这个杀手不太冷 - 评分: 9.4 - 评语: 暂无评价
8. 星际穿越 - 评分: 9.4 - 评语: 暂无评价
9. 盗梦空间 - 评分: 9.4 - 评语: 暂无评价
10. 楚门的世界 - 评分: 9.4 - 评语: 暂无评价
数据已保存到 豆瓣电影Top10.xlsx
爬取完成


In [7]:
pip install lxml -i https://pypi.tuna.tsinghua.edu.cn/simple

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Note: you may need to restart the kernel to use updated packages.


In [23]:
import requests
from lxml import etree
import os
import time

def download_images(base_url="http://pic.netbian.com/", save_dir="d:/images",
                    delay=0.5, timeout=10):
    """Download every thumbnail listed on a gallery index page.

    The index page is fetched and decoded as GBK, image ``src`` values are
    extracted with XPath, and each image is saved as ``image_<index><ext>``
    inside ``save_dir``. Progress and errors are printed to stdout; nothing
    is returned.

    Args:
        base_url: Index page to scrape; also the base for resolving relative
            image URLs.
        save_dir: Directory that receives the files (created if missing).
        delay: Seconds to sleep after each successful download.
        timeout: Per-request timeout in seconds, used for the index page and
            for every image request.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin, urlparse

    # Request headers that mimic a desktop browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Fetch the index page. The timeout stops a stalled connection from
        # hanging the call, and an HTTP error status is reported rather than
        # parsed as if it were the gallery page.
        response = requests.get(base_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        response.encoding = "gbk"
        html_content = response.text

        # Parse the HTML and collect the image URL list.
        tree = etree.HTML(html_content)
        img_urls = tree.xpath("//ul[@class='clearfix']/li/a/span/img/@src")

        if not img_urls:
            print("未找到图片URL")
            return

        total = len(img_urls)
        print(f"找到 {total} 张图片")

        # Create the target directory.
        os.makedirs(save_dir, exist_ok=True)

        downloaded = 0
        for i, img_path in enumerate(img_urls):
            # urljoin resolves root-relative and relative paths, and leaves
            # already-absolute or protocol-relative URLs intact.
            img_url = urljoin(base_url, img_path)

            try:
                print(f"正在下载第 {i+1}/{total} 张图片: {img_url}")
                img_response = requests.get(img_url, headers=headers, timeout=timeout)
                img_response.raise_for_status()

                # Take the extension from the URL path only, so a query
                # string or fragment cannot end up in the file name.
                file_ext = os.path.splitext(urlparse(img_url).path)[1] or '.jpg'

                save_path = os.path.join(save_dir, f"image_{i}{file_ext}")
                with open(save_path, 'wb') as f:
                    f.write(img_response.content)

                print(f"图片已保存至: {save_path}")
                downloaded += 1

                # Pause between requests to avoid hitting the server too often.
                time.sleep(delay)

            except requests.exceptions.RequestException as e:
                # A failed image is reported and skipped; the loop goes on.
                print(f"下载图片失败: {e}")
                continue

        # Report the number of files actually saved, not the number found.
        print(f"全部下载完成，共下载 {downloaded} 张图片")

    except Exception as e:
        # Best-effort top level: any other failure is reported on stdout
        # instead of raising.
        print(f"发生错误: {e}")

# Run the downloader only when this cell/script executes as the main module.
if __name__ == "__main__":
    download_images()    

找到 20 张图片
正在下载第 1/20 张图片: http://pic.netbian.com/uploads/allimg/250627/202545-175102714571ed.jpg
图片已保存至: d:/images\image_0.jpg
正在下载第 2/20 张图片: http://pic.netbian.com/uploads/allimg/250626/231909-17509511496e7b.jpg
图片已保存至: d:/images\image_1.jpg
正在下载第 3/20 张图片: http://pic.netbian.com/uploads/allimg/250627/172845-1751016525523a.jpg
图片已保存至: d:/images\image_2.jpg
正在下载第 4/20 张图片: http://pic.netbian.com/uploads/allimg/250627/170634-17510151946096.jpg
图片已保存至: d:/images\image_3.jpg
正在下载第 5/20 张图片: http://pic.netbian.com/uploads/allimg/250627/105030-17509926303a62.jpg
图片已保存至: d:/images\image_4.jpg
正在下载第 6/20 张图片: http://pic.netbian.com/uploads/allimg/250626/110455-1750907095c440.jpg
图片已保存至: d:/images\image_5.jpg
正在下载第 7/20 张图片: http://pic.netbian.com/uploads/allimg/250625/233227-17508655473d8e.jpg
图片已保存至: d:/images\image_6.jpg
正在下载第 8/20 张图片: http://pic.netbian.com/uploads/allimg/250614/201508-17499033082071.jpg
图片已保存至: d:/images\image_7.jpg
正在下载第 9/20 张图片: http://pic.netbian.com/uploads/allimg/