scrap-cap-ru.py
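"""Scrape news headlines, dates and times from http://www.cap.ru/news/ into a
pandas DataFrame, paging through the site's news listing pages."""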
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://www.cap.ru/news/'
# Dict keys must be unique, so only one HTTP proxy can be configured here;
# in the original the second entry silently overwrote the first.
# Alternative proxy: 188.0.168.205:8080
proxy = {'http': 'http://178.213.145.24:8080'}
def get_html(url):
    # Fetch a page and return its body as text.
    response = requests.get(url)
    return response.text
def get_page_count(html):
    # The last element of the pager holds the total number of listing pages.
    soup = BeautifulSoup(html, 'html.parser')
    last = soup.find('div', class_='PagerLast')
    return int(last.text.strip())
def parse(html):
    # Extract headlines, dates and times from a single news listing page.
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('div', class_='main_news')
    topics = table.findAll('div', class_='news_title')
    date = table.findAll('div', class_='doc_date')
    # Dates appear in Russian, e.g. "12 мая 2016 г."; times as "09:30".
    dates = re.findall(r'\d+\s\w+\s\d+\sг\.', str(date))
    time = re.findall(r'\d\d:\d\d', str(date))
    news = []
    for i in topics:
        news.append(i.text.strip())
    df = pd.DataFrame({'topics': news, 'dates': dates, 'times': time})
    return df
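# Note: the DataFrame constructor above assumes the three lists line up one to
# one; pandas raises ValueError if any news item lacks a date or time stamp.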
def myip():
    # Report the externally visible IP address, routed through the proxy.
    response = requests.get('http://pr-cy.ru/browser-details/', proxies=proxy)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.html.find('div', class_='ip').text
def main():
    print('My IP:', myip())
    pages = get_page_count(get_html(BASE_URL))
    print('Found:', pages, 'pages')
    allnews = pd.DataFrame()
    # Iterate over the actual page count rather than a hard-coded 1000.
    for page in range(1, pages + 1):
        print('Parsing %d%%' % (page / pages * 100))
        html = get_html(BASE_URL + '?page=%d' % page)
        # DataFrame.append was removed in pandas 2.0; concat is the supported way.
        allnews = pd.concat([allnews, parse(html)], ignore_index=True)
    return allnews
if __name__ == '__main__':
    allnews = main()
    print(allnews)
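    # A possible follow-up (hypothetical output path, not in the original):
    # allnews.to_csv('cap_ru_news.csv', index=False)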