-
-
Notifications
You must be signed in to change notification settings - Fork 65
/
douban_top_250_movies.py
111 lines (99 loc) · 3.73 KB
/
douban_top_250_movies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python
# encoding: utf-8
'''
#-------------------------------------------------------------------
# CONFIDENTIAL --- CUSTOM STUDIOS
#-------------------------------------------------------------------
#
# @Project Name : 豆瓣最受欢迎的250部电影
#
# @File Name : douban_top_250_movies.py
#
# @Programmer : tinygeeker
#
# @Start Date : 2022/01/10 13:14
#
# @Last Update : 2022/01/10 13:14
#
#-------------------------------------------------------------------
'''
import requests, xlwt
from bs4 import BeautifulSoup
class douban:
    '''
    Scraper for the Douban Top 250 movie list.

    Fetches every result page of https://movie.douban.com/top250, pulls one
    record per movie out of the HTML and stores the records in an xlwt
    workbook, one row per movie.
    '''

    # The list is requested page by page via the ``start`` query parameter.
    PAGE_SIZE = 25
    PAGE_COUNT = 10
    # Seconds to wait for the server before giving up on a single page.
    REQUEST_TIMEOUT = 10
    # Column titles written to row 0 of the sheet.
    HEADERS = ('名称', '图片', '排名', '评分', '作者', '简介')

    def __init__(self):
        '''
        Set up the target URL, the request headers, the workbook and the
        index of the next sheet row to fill (row 0 holds the column titles).
        '''
        self.url = 'https://movie.douban.com/top250'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/88.0.4324.146 Safari/537.36',
        }
        self.book = xlwt.Workbook(encoding='utf-8', style_compression=0)
        self.row = 1

    def hello(self):
        '''
        Print the welcome banner.

        :return: self, so that calls can be chained
        '''
        print('*' * 50)
        print(' ' * 15 + '豆瓣最受欢迎的250部电影')
        print(' ' * 5 + '作者: tinygeeker Date: 2022-01-10 13:14')
        print(' ' * 5 + '主页: https://tinygeeker.blog.csdn.net')
        print('*' * 50)
        return self

    def request(self, url):
        '''
        Download one page.

        :param url: address of the page to fetch
        :return: the response body as text, or None when the request fails
                 or the server answers with a status other than 200
        '''
        try:
            # A timeout keeps one stalled connection from hanging the run.
            response = requests.get(
                url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None

    @staticmethod
    def _text(node, default=''):
        '''
        Return the text inside a parsed tag, or ``default`` if the tag is
        missing. ``get_text()`` is used instead of ``.string`` because
        ``.string`` is None for tags with more than one child.
        '''
        if node is None:
            return default
        return node.get_text()

    def save_to_excel(self, sheet, soup):
        '''
        Append every movie found on one parsed page to the sheet.

        Rows are written starting at ``self.row``, which is advanced by one
        for each movie. A page without the movie list is reported and
        skipped instead of raising.

        :param sheet: xlwt worksheet to write into
        :param soup: parsed HTML of one result page
        '''
        grid = soup.find(class_='grid_view')
        if grid is None:
            # The response was not the expected list page.
            print('WARNING: movie list not found on page, skipped')
            return
        for item in grid.find_all('li'):
            link = item.find('a')
            img = link.find('img') if link is not None else None
            item_img = img.get('src') if img is not None else ''
            item_name = self._text(item.find(class_='title'))
            # The rank is read from the element whose class attribute is empty.
            item_index = self._text(item.find(class_=''))
            item_score = self._text(item.find(class_='rating_num'))
            item_author = self._text(item.find('p'))
            item_intro = self._text(item.find(class_='inq'), 'NOT AVAILABLE')
            print('爬取电影:' + ' | '.join(
                [item_index, item_name, item_score, item_intro]))
            record = (item_name, item_img, item_index,
                      item_score, item_author, item_intro)
            for col, value in enumerate(record):
                sheet.write(self.row, col, value)
            self.row += 1

    def run(self, filename='豆瓣最受欢迎的250部电影.xls'):
        '''
        Program entry: scrape all pages and save the workbook.

        :param filename: path of the workbook to write. xlwt produces the
                         legacy binary Excel format, so the name should end
                         in ``.xls``; an ``.xlsx`` name yields a file that
                         Excel rejects as mismatching its extension.
        '''
        sheet = self.book.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)
        for col, title in enumerate(self.HEADERS):
            sheet.write(0, col, title)
        for page in range(self.PAGE_COUNT):
            target_url = self.url + '?start=' + str(page * self.PAGE_SIZE) + '&filter='
            html = self.request(target_url)
            if html is None:
                # request() already swallowed the error; skip this page
                # rather than handing None to the HTML parser.
                print('WARNING: failed to fetch ' + target_url + ', skipped')
                continue
            soup = BeautifulSoup(html, 'lxml')
            self.save_to_excel(sheet, soup)
        self.book.save(filename)
# Script entry point: show the banner, then scrape and save the list.
if __name__ == '__main__':
    scraper = douban()
    scraper = scraper.hello()
    scraper.run()