In [3]:
pip install requests -i https://pypi.tuna.tsinghua.edu.cn/simple

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Note: you may need to restart the kernel to use updated packages.


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Request headers passed to requests.get() by get_movie_info: a browser-like
# User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

def get_movie_info(url):
    """Fetch a page over HTTP and return its body as text.

    The request is sent with the module-level ``headers`` and a 10-second
    timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The response text, or ``None`` if ``requests`` raised a
        ``RequestException`` (which includes the error raised by
        ``raise_for_status()``). The error is printed, not re-raised.
    """
    try:
        resp = requests.get(url, timeout=10, headers=headers)
        resp.raise_for_status()
        # Decode the body with the encoding requests reports as apparent_encoding.
        resp.encoding = resp.apparent_encoding
        page_text = resp.text
    except requests.exceptions.RequestException as e:
        print(f"请求出错: {e}")
        page_text = None
    return page_text

def parse_movie_data(html_content, limit=10):
    """Extract rank, title, rating and one-line quote from a movie list page.

    Args:
        html_content: HTML text of the list page. A falsy value (``None`` or
            ``""``) produces an empty list without parsing anything.
        limit: Maximum number of ``div.item`` entries to read, in document
            order. Defaults to 10; pass ``None`` to read every entry.

    Returns:
        A list of dicts with the keys ``'排名'``, ``'片名'``, ``'评分'`` and
        ``'评语'``, all holding stripped strings. Entries that lack a rank,
        title or rating element are skipped. A missing quote is replaced by
        the placeholder ``"暂无评价"``.
    """
    if not html_content:
        return []

    soup = BeautifulSoup(html_content, 'html.parser')
    movie_list = []

    for item in soup.select('div.item')[:limit]:
        rank_tag = item.select_one('div.pic em')
        title_tag = item.select_one('span.title')
        rating_tag = item.select_one('span.rating_num')
        # An entry without any of the mandatory fields is skipped explicitly
        # rather than by catching the AttributeError raised by `None.text`.
        if rank_tag is None or title_tag is None or rating_tag is None:
            continue

        quote_tag = item.select_one('span.inq')
        if quote_tag is None:
            # The recorded run of this cell printed the placeholder for every
            # movie, so `span.inq` did not match the page it fetched.
            # NOTE(review): the quote presumably now sits in a plain <span>
            # inside <p class="quote"> - confirm against the live markup.
            quote_tag = item.select_one('p.quote span')
        quote = quote_tag.text.strip() if quote_tag is not None else "暂无评价"

        movie_list.append({
            '排名': rank_tag.text.strip(),
            '片名': title_tag.text.strip(),
            '评分': rating_tag.text.strip(),
            '评语': quote
        })

    return movie_list

def save_to_excel(data, filename='豆瓣电影Top10.xlsx'):
    """Write the records to an Excel workbook and print a confirmation.

    Args:
        data: Records handed to ``pd.DataFrame``; the caller in this file
            passes the list of dicts built by ``parse_movie_data``.
        filename: Path of the workbook to write.
    """
    # Written with the openpyxl engine and without the DataFrame index column.
    pd.DataFrame(data).to_excel(filename, engine='openpyxl', index=False)
    print(f"数据已保存到 {filename}")

if __name__ == "__main__":
    # Driver: download the list page, parse it, print each movie and export
    # the result to Excel. Nothing is saved when no data was obtained.
    url = 'https://movie.douban.com/top250'
    print("开始爬取豆瓣电影排行榜数据...")

    html_content = get_movie_info(url)
    movie_data = parse_movie_data(html_content)

    if not movie_data:
        # Either the request failed (html_content is None) or nothing was parsed.
        print("未能获取到电影数据")
    else:
        print(f"成功获取 {len(movie_data)} 条电影数据")
        for movie in movie_data:
            print(f"{movie['排名']}. {movie['片名']} - 评分: {movie['评分']} - 评语: {movie['评语']}")

        save_to_excel(movie_data)

    print("爬取完成")

开始爬取豆瓣电影排行榜数据...
成功获取 10 条电影数据
1. 肖申克的救赎 - 评分: 9.7 - 评语: 暂无评价
2. 霸王别姬 - 评分: 9.6 - 评语: 暂无评价
3. 泰坦尼克号 - 评分: 9.5 - 评语: 暂无评价
4. 阿甘正传 - 评分: 9.5 - 评语: 暂无评价
5. 千与千寻 - 评分: 9.4 - 评语: 暂无评价
6. 美丽人生 - 评分: 9.5 - 评语: 暂无评价
7. 这个杀手不太冷 - 评分: 9.4 - 评语: 暂无评价
8. 星际穿越 - 评分: 9.4 - 评语: 暂无评价
9. 盗梦空间 - 评分: 9.4 - 评语: 暂无评价
10. 楚门的世界 - 评分: 9.4 - 评语: 暂无评价
数据已保存到 豆瓣电影Top10.xlsx
爬取完成
