# 使用Python爬取豆瓣TOP250电影榜

+ 使用requests爬取网页
+ 使用BeautifulSoup实现数据解析
+ 借助pandas将数据写出到CSV文件（utf-8-sig编码，可用Excel打开）

In [11]:
import requests
from  bs4 import BeautifulSoup
import pprint
import json
import pandas as pd

## 下载共10个页面的html

In [12]:
# Start offsets of the list pages: 0, 25, ..., 225 (ten values in total).
page_indexs = range(0,250,25)

In [13]:
# Materialize the range so its individual values are displayed.
list(page_indexs)

[0, 25, 50, 75, 100, 125, 150, 175, 200, 225]

In [14]:
def download_all_htmls():
    """Download the HTML of every list page for later analysis.

    Requests one page per start offset in the module-level ``page_indexs``
    and prints each URL before it is fetched.

    Returns:
        list: The response text of each page, in the order of ``page_indexs``.

    Raises:
        Exception: If a response comes back with a status code other than 200.
    """
    # Built once up front: the headers are identical for every request.
    # NOTE(review): a browser-like User-Agent is sent, presumably because the
    # site rejects the default client User-Agent -- confirm.
    header = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/107.0.0.0 Safari/537.36"
        ),
    }

    htmls = []
    for idx in page_indexs:
        url = f"https://movie.douban.com/top250?start={idx}&filter="
        print(f"craw html: {url}")

        r = requests.get(url=url, headers=header, timeout=10)
        # Fail fast rather than collect an error page that cannot be parsed.
        if r.status_code != 200:
            raise Exception(f"request error status code:{r.status_code}")
        htmls.append(r.text)
    return htmls

In [15]:
# Fetch all list pages; htmls holds the value returned by download_all_htmls().
htmls = download_all_htmls()

craw html: https://movie.douban.com/top250?start=0&filter=
craw html: https://movie.douban.com/top250?start=25&filter=
craw html: https://movie.douban.com/top250?start=50&filter=
craw html: https://movie.douban.com/top250?start=75&filter=
craw html: https://movie.douban.com/top250?start=100&filter=
craw html: https://movie.douban.com/top250?start=125&filter=
craw html: https://movie.douban.com/top250?start=150&filter=
craw html: https://movie.douban.com/top250?start=175&filter=
craw html: https://movie.douban.com/top250?start=200&filter=
craw html: https://movie.douban.com/top250?start=225&filter=


## 解析HTML得到数据

In [7]:
def parse_single_page(html):
    """Extract the movie records from the HTML of one list page.

    Args:
        html: HTML text of a single list page.

    Returns:
        list: One dict per ``div.item`` found inside ``ol.grid_view``, with
        the keys ``rank``, ``title``, ``score``, ``rating_num`` and
        ``comment_num``.
    """
    document = BeautifulSoup(html, "html.parser")
    grid = document.find("ol", class_="grid_view")

    def to_record(item):
        """Build the record dict for a single ``div.item`` element."""
        rank_text = item.find("em").get_text()
        title_text = item.find("span", class_="title").text
        star_spans = item.find("div", class_="star").find_all("span")

        # The first span's CSS class carries the star value: strip the
        # "rating" and "-t" parts and read what remains as an integer.
        css_class = star_spans[0]["class"][0]
        stars = int(css_class.replace("rating", "").replace("-t", ""))
        # Values above 5 are treated as tenths (e.g. 45 becomes 4.5).
        if stars > 5:
            stars = stars / 10

        rating_text = star_spans[1].get_text()
        comment_text = star_spans[3].get_text()
        return {
            "rank": rank_text,
            "title": title_text,
            "score": stars,
            "rating_num": rating_text,
            "comment_num": comment_text.replace("人评价", ""),
        }

    return [to_record(item) for item in grid.find_all("div", class_="item")]



In [9]:
# Gather the records parsed from every downloaded page into one flat list.
all_data = []
for html in htmls:
    datas = parse_single_page(html)
    all_data += datas


## 将结果存入CSV文件（可用Excel打开）

In [72]:
# Build a table from the parsed records and write it out as a CSV file.
# The file is encoded as utf-8-sig (UTF-8 with a BOM) and omits the row index.
movie_pd = pd.DataFrame(all_data)
movie_pd.to_csv(
    "doubanTop250.csv",
    encoding="utf-8-sig",
    index=False,
)