In [18]:
import multiprocessing
import pandas as pd

In [19]:
article_links = pd.read_csv('asx_article_links.csv')
print(len(article_links))
article_links.head(2)

56341


Unnamed: 0,links
0,https://www.aisixiang.com/data/149906.html
1,https://www.aisixiang.com/data/149797.html


In [20]:
# article_links = article_links.sample(10, random_state=42)

In [24]:
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.error import URLError
import re
import requests

def get_authors_and_title(text):
    num_splits = len(text.split('：'))
    # has both
    if num_splits >= 2:
        return text.split('：', 1)
    # no authors, just title
    if num_splits == 1:
        return None, text


def get_article_content(link, timeout=10):
    title = ''
    authors = ''
    date = ''
    article_text = ''
    try:
        with requests.get(link, timeout=timeout) as page:
                
                # html = page.read().decode("utf-8")
                # soup = BeautifulSoup(html, "lxml")
                soup = BeautifulSoup(page.content, "lxml")

                # get the article text
                article_content = soup.find(id='content')
                text_blocks = article_content.find_all('p', string=True)
                
                for para in text_blocks:
                    article_text += para.text.strip()

                # get the article date
                article_info = soup.find(class_='info')
                date = re.search('....-..-..', article_info.text).group(0)

                # get the authors and title
                h3_pieces = soup.find_all('h3')
                authors, title = get_authors_and_title(h3_pieces[0].text)
            
    except URLError as e:
        print(f"Error fetching links from {link}: {e}")
    except Exception as e:
        print(f"Error processing {link}: {e}")
    finally:
        # return {'title': title, 'authors': authors, 'date':date, 'url': link, 'text': article_text}
        return title, authors, date, link, article_text


In [25]:
# # split it into 1000-piece chunks
# chunk_size = 500
# for i in range(0, len(article_links), chunk_size):
#     temp_articles = article_links.iloc[i:i+chunk_size]
#     filepath = "./asx_article_links_split/split_links_" + str(i) + ".csv"
#     temp_articles.to_csv(filepath, index=False)

In [26]:
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [27]:
def scrape_articles():  
    NUM_ARTICLES = 56341
    chunk_size = 500
    for i in range(8500, NUM_ARTICLES, chunk_size):
        titles = []
        authors = []
        dates = []
        urls = []
        texts = []

        link_filepath = "./asx_article_links_split/split_links_" + str(i) + ".csv"
        print(f'Loading the file: {link_filepath}')
        links = pd.read_csv(link_filepath)

        for link in links['links']:
            title, author, date, link, text = get_article_content(link)
            titles.append(title)
            authors.append(author)
            dates.append(date)
            urls.append(link)
            texts.append(text)
            
        df = pd.DataFrame({'title': titles, 'author': authors, 'date':dates, 'url': urls, 'text':texts})
        print(f'Completed scraping articles {i}-{i + chunk_size}')
        article_filepath = "./asx_articles/articles_" + str(i) + ".csv"
        df.to_csv(article_filepath, index=False)
        print(f"Saved scraped articles")

%lprun -f scrape_articles scrape_articles()

Loading the file: ./asx_article_links_split/split_links_8500.csv
Completed scraping articles 8500-9000
Saved scraped articles
Loading the file: ./asx_article_links_split/split_links_9000.csv
Error processing https://www.aisixiang.comhttp://www.zhinong.cn/data/search.php?keyWords=%D5%D4%BF%A1%B3%BC&searchfield=author: HTTPSConnectionPool(host='www.aisixiang.comhttp', port=443): Max retries exceeded with url: //www.zhinong.cn/data/search.php?keyWords=%D5%D4%BF%A1%B3%BC&searchfield=author (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x13a135f90>: Failed to resolve 'www.aisixiang.comhttp' ([Errno 8] nodename nor servname provided, or not known)"))
Completed scraping articles 9000-9500
Saved scraped articles
Loading the file: ./asx_article_links_split/split_links_9500.csv
Completed scraping articles 9500-10000
Saved scraped articles
Loading the file: ./asx_article_links_split/split_links_10000.csv
Completed scraping articles 10000-10500
Saved scraped article

- with lxml and cchardet: 500 pages = 4 min
- 100% of time is from the get_article_content() function
- changing it so it doesn't get html first, and just uses page.contents doesn't work, since page doesn't have contents...
- everybody else seems to be using requests instead of openurl...
- after the updates, it took 10 min to do 1500

I ran the above code for 2.5 hours (151 min) and it scraped 8500-29500. So... 21,000 in 150 min. Or about 3.6 min for each batch of 500. Scrapy does a batch in like 15-30 sec, with an average probably around 18 sec? So that is about 14x faster. 

In [None]:
# check this next time, see if I can speed up some of the parsing... 
# https://thehftguy.com/2020/07/28/making-beautifulsoup-parsing-10-times-faster/ 


## Development Space

In [8]:
import pandas as pd

links = pd.read_csv('./asx_article_links_split/test_links.csv')
links = links.sample(10)['links']
links

15    https://www.aisixiang.com/data/140669.html
57    https://www.aisixiang.com/data/149722.html
12    https://www.aisixiang.com/data/149720.html
74     https://www.aisixiang.com/data/66926.html
24     https://www.aisixiang.com/data/81350.html
51    https://www.aisixiang.com/data/149760.html
50    https://www.aisixiang.com/data/149762.html
81     https://www.aisixiang.com/data/24186.html
8     https://www.aisixiang.com/data/149730.html
41    https://www.aisixiang.com/data/101839.html
Name: links, dtype: object

In [14]:
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.error import URLError
import re
import requests

def get_authors_and_title(text):
    num_splits = len(text.split('：'))
    # has both
    if num_splits >= 2:
        return text.split('：', 1)
    # no authors, just title
    if num_splits == 1:
        return None, text


def get_article_content(links, timeout=10):
    for link in links:
        title = ''
        authors = ''
        date = ''
        article_text = ''
        try:
            with requests.get(link) as page:
                
                # html = page.read().decode("utf-8")
                # soup = BeautifulSoup(html, "lxml")
                soup = BeautifulSoup(page.content, "lxml")

                # get the article text
                article_content = soup.find(id='content')
                text_blocks = article_content.find_all('p', string=True)
                
                for para in text_blocks:
                    article_text += para.text.strip()

                # get the article date
                article_info = soup.find(class_='info')
                date = re.search('....-..-..', article_info.text).group(0)

                # get the authors and title
                h3_pieces = soup.find_all('h3')
                authors, title = get_authors_and_title(h3_pieces[0].text)
                
        except URLError as e:
            print(f"Error fetching links from {link}: {e}")
        except Exception as e:
            print(f"Error processing {link}: {e}")
        finally:
            # return {'title': title, 'authors': authors, 'date':date, 'url': link, 'text': article_text}
            print({'title': title, 'authors': authors, 'date':date, 'url': link, 'text': article_text})
            # return title, authors, date, link, article_text


In [15]:
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [17]:
%lprun -f get_article_content get_article_content(links)

{'title': '教师职业违规行为处理专门立法应当厘清的若干问题', 'authors': '湛中乐 赵磊', 'date': '2023-02-12', 'url': 'https://www.aisixiang.com/data/140669.html', 'text': '摘要：目前教师职业违规行为的处理存在诸多问题，究其根本，主要在于针对教师职业违规行为的处理没有统一的高位阶处理规范，故亟待制定一部统一的高位阶行政法规或部门规章对实践中频发的教师职业违规行为处理进行指导。在立法的过程中，需要注意六个问题：一是教师及教师职业行为的范围界定；二是教师职业违规行为的认定及规定方式的确定；三是明确教师职业违规行为处理措施的类型与适用；四是设立教师职业违规行为监督委员会；五是保障教师职业违规行为处理程序的规范性与高效性；六是健全教师权利救济的多元化渠道。关键词：教育立法；教师职业行为；教师职业违规行为；教师救济基金项目：国家社科基金项目“中国教育法学总论体系构建研究”(20BFX050)党的二十大报告指出：“教育、科技、人才是全面建设社会主义现代化国家的基础性、战略性支撑。”[1]这一论述对我国教育的重要战略地位进行了确立和强调，明确把教育作为全面建设社会主义现代化国家的战略先导之一。教育的根本任务是立德树人，教师在这个过程中起着不可替代的作用。党的十八大以来，以习近平同志为核心的党中央高度重视教师队伍建设问题，尤其重视师德师风的建设，党的二十大报告再次强调：“加强师德师风建设，培养高素质教师队伍，弘扬尊师重教社会风尚。”[1]师德师风建设又被提到一个新的高度，引起了学界更为广泛的关注。教师职业违规行为处理是规范教师队伍管理、防范教师不当行为发生的必要举措。而相较于此类处理措施的严肃性和严格性，我国目前现行有效的教师管理规定不足以指导实践中频发的教师违规行为处理：一是处理依据位阶低；二是处理依据名目繁多，各地各学校异化严重，出现诸多“同案不同判”的现象。正是基于以上两个原因，实践中的教师违规行为处理多受舆论的“挟持”。近年来，随着网络不断深入人们生活，微博、微信等社交平台频繁爆出教师因为诸如道德失范等行为而被学校开除的新闻，引发舆情事件。随着对这些问题的深入了解，不难发现，社会新闻中引起热议的主要是教师的道德失范行为，(1)相较而言，鲜有教师因违规行为而受学校处理

Timer unit: 1e-09 s

Total time: 3.96186 s
File: /var/folders/r4/qqn8tv4d7g18t_5fsxt6y1340000gn/T/ipykernel_39228/459459994.py
Function: get_article_content at line 18

Line #      Hits         Time  Per Hit   % Time  Line Contents
    18                                           def get_article_content(links, timeout=10):
    19        11      44000.0   4000.0      0.0      for link in links:
    20        10       2000.0    200.0      0.0          title = ''
    21        10       4000.0    400.0      0.0          authors = ''
    22        10          0.0      0.0      0.0          date = ''
    23        10       7000.0    700.0      0.0          article_text = ''
    24        10          0.0      0.0      0.0          try:
    25        20 3720161000.0    2e+08     93.9              with requests.get(link) as page:
    26                                                           
    27                                                           # html = page.read().decode("utf-8")

- starting: 4.3 sec for 10 articles
- changing it to be page.text instead of page.content: 4.3 sec (4.28)
- so it's basically the same...
