-
Notifications
You must be signed in to change notification settings - Fork 0
/
web.py
39 lines (35 loc) · 1.17 KB
/
web.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from turtle import title
import requests
from bs4 import BeautifulSoup
from newspaper import Article
import pandas as pd
from tqdm import tqdm
def scrapAll(pages_num=None, file_name=None):
    """Scrape news articles from the tabnak.ir archive and save them to a CSV.

    Walks the archive listing page by page, downloads and parses each linked
    article with newspaper's ``Article``, and writes url/text/title rows via
    pandas.

    Parameters
    ----------
    pages_num : int or str, optional
        Last archive page to scrape (inclusive). When None, the user is
        prompted on stdin (original behavior).
    file_name : str, optional
        Output CSV base name without the ``.csv`` extension. When None,
        the user is prompted on stdin (original behavior).
    """
    if pages_num is None:
        pages_num = input('number of pages:')
    if file_name is None:
        file_name = input('csv file name(dont enter .csv):')
    last_page = int(pages_num)

    scraped_data = []
    # NOTE(review): scraping starts at page 2, so archive page 1 is never
    # visited — confirm this is intentional.
    page = 2
    while True:
        page_url = f'https://www.tabnak.ir/fa/archive?service_id=-1&sec_id=-1&cat_id=-1&rpp=20&from_date=1384/01/01&to_date=1401/03/24&p={page}'
        # timeout prevents the scraper from hanging forever on a dead server.
        html = requests.get(page_url, timeout=30).text
        soup = BeautifulSoup(html, 'lxml')
        links = soup.findAll("div", {"class": "linear_news"})
        page += 1
        for link in tqdm(links):
            news_url = 'https://www.tabnak.ir' + link.a['href']
            try:
                article = Article(news_url)
                article.download()
                article.parse()
            except Exception:
                # Skip this article entirely. The original bare `except:`
                # fell through and appended `article` data anyway — a
                # NameError on the first failure, or the PREVIOUS article's
                # text/title recorded under the wrong URL on later failures.
                print('download failed')
                continue
            scraped_data.append({
                'url': news_url,
                'text': article.text,
                'title': article.title
            })
        # Stop once pages 2..last_page have been scraped.
        if page == last_page + 1:
            break
    df = pd.DataFrame(scraped_data)
    df.to_csv(f'{file_name}.csv')
scrapAll()