-
Notifications
You must be signed in to change notification settings - Fork 1
/
1_1.py
30 lines (26 loc) · 1.12 KB
/
1_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import re
import csv
from urllib import request
from bs4 import BeautifulSoup
# Scrape the category index of the healthcaremagic.com community page and
# store one row per category in categories.csv:
#     title, url, category id, number of questions
page_url = "http://www.healthcaremagic.com/community"

# Use the response as a context manager so the HTTP connection is released
# as soon as the body has been read.
with request.urlopen(page_url) as response:
    page = response.read().decode("utf-8", "ignore")
soup = BeautifulSoup(page, "html.parser")

# All category links and their question counters live inside this container.
div = soup.find("div", class_="linePadding7")
if div is None:
    # Fail with a readable message instead of an AttributeError on None
    # when the page layout changes or an error page is served.
    raise SystemExit(
        "category container (div.linePadding7) not found at " + page_url
    )

categories = []
for category_a in div.find_all("a", class_="questionTitle"):
    # get_text() also works when the link wraps nested markup, where
    # `.string` would be None.
    title = category_a.get_text(strip=True)
    url = "http://www.healthcaremagic.com" + category_a.get('href')
    # The category id is the last path segment of the link.
    cat_id = url.split("/")[-1]
    categories.append([title, url, cat_id])

# This segment retrieves the number of questions asked in each category.
# The spans are matched to the links by position; zip() stops at the shorter
# sequence, so a surplus span can no longer raise IndexError.
categories_span = div.find_all("span", style="display: block;font-size:10px; color:#999;")
for category, category_span in zip(categories, categories_span):
    found = re.search(r'\d+', category_span.get_text())
    # Keep the column present (empty) when the span carries no number.
    category.append(found.group() if found else "")

# Create categories.csv and store all the information retrieved.
# newline='' is what the csv module expects (avoids blank rows on Windows);
# the with-block guarantees the file is flushed and closed.
with open('categories.csv', 'w', encoding='utf-8', newline='') as csv_file:
    csv.writer(csv_file, delimiter=",").writerows(categories)

# Terminate explicitly like the original quit(), but without relying on the
# `site` module, which is what injects quit() (absent under `python -S`).
raise SystemExit