In [1]:
import pandas as pd

In [2]:
# load every scraped article and report how many rows were read
articles = pd.read_csv('../get_articles/all_articles.csv')
print(f"Total number of articles scraped: {articles.shape[0]}")

Total number of articles scraped: 2181


In [3]:
# sanity check: count the articles that have no value in the year column
missing_year = articles.loc[articles['year'].isna()]
len(missing_year)

0

In [4]:
def determine_edition_quarter(edition_number):
    """Return the quarter (1-4) that an edition number belongs to.

    Editions 1-6 map to quarter 1, 7-12 to quarter 2 and 13-18 to quarter 3.
    Every other value falls through to quarter 4.
    """
    quarter_editions = (range(1, 7), range(7, 13), range(13, 19))
    for quarter, editions in enumerate(quarter_editions, start=1):
        if edition_number in editions:
            return quarter
    return 4
    
def combine_year_quarter(row):
    """Build a label such as "2019 Q1" from a row's "year" and "quarter" entries."""
    return f"{row['year']!s} Q{row['quarter']!s}"

# rows without a year cannot be assigned a year/quarter label, so discard them first
articles = articles.dropna(subset=["year"])
print(f"Number of articles after dropping missing years: {articles.shape[0]}")

# derive the quarter from the edition number, then store the year as an integer
articles = articles.assign(
    quarter=articles["edition"].map(determine_edition_quarter)
).astype({"year": int})

# label every row with its combined year/quarter, e.g. "2019 Q1"
articles["year_quarter"] = articles.apply(combine_year_quarter, axis=1)

Number of articles after dropping missing years: 2181


In [5]:
# Save the articles that have no text so they can be checked by hand: they are
# expected to be statistical reports, not pages that failed to scrape.
textless_articles = articles.loc[articles['text'].isna()]
textless_articles.to_csv('textless_articles.csv')

# continue with only the articles that do have text
articles = articles.dropna(subset=['text'])
print(f'Number of articles after dropping those with no text: {articles.shape[0]}')

Number of articles after dropping those with no text: 2058


In [6]:
# Find the "top-level" entries: pages that have three or more articles from the same
# edition copied inside of them, so their text duplicates other rows. They are
# recognised by the marker 编者按 appearing in the text.
suspected_toplevel = articles[articles.text.str.contains('编者按')]
suspected_toplevel['url'].to_csv('suspected_toplevel.csv')
toplevel_links = list(suspected_toplevel['url'])
print(f'Suspected top-level links: {len(toplevel_links)}')

# the few links that are actually legit (determined by looking at all 16)
legit_links = ['http://www.qstheory.cn/dukan/qs/2019-01/16/c_1123987212.htm',
               'http://www.qstheory.cn/dukan/qs/2019-01/16/c_1123987342.htm',
               'http://www.qstheory.cn/dukan/qs/2019-07/01/c_1124690404.htm']

# Filter instead of calling list.remove(): remove() raises ValueError when a link is
# not among the suspects and deletes only the first occurrence, which would leave a
# legit article marked for removal if its URL appears on more than one row.
toplevel_links = [link for link in toplevel_links if link not in legit_links]
print(f'Top-level links: {len(toplevel_links)}')

Suspected top-level links: 16
Top-level links: 13


In [7]:
# remove every row whose URL is in the list of top-level links
articles = articles.loc[lambda frame: ~frame['url'].isin(toplevel_links)]
print(f'Number of articles after dropping top-level entries: {articles.shape[0]}')

Number of articles after dropping top-level entries: 2045


In [8]:
# 2024 does not contain a complete quarter yet, so leave its articles out
# for the sake of consistency
articles = articles.loc[lambda frame: frame['year'].ne(2024)]
print(f'Number of articles after dropping 2024 entries: {articles.shape[0]}')

Number of articles after dropping 2024 entries: 2014


In [9]:
# Some of Xi Jinping's speeches ended up in the data more than once. The copies
# share the same URL, so keep only the first row seen for each URL.
# drop_duplicates does this in one pass and avoids dropping rows from the frame
# while iterating over it.
articles = articles.drop_duplicates(subset='url', keep='first')

print(f'Number of articles after dropping duplicate links: {len(articles)}')

Number of articles after dropping duplicate links: 1998


In [10]:
# order the rows by their date column
articles = articles.sort_values(by='date')

In [11]:
# display the resulting frame as a final visual check before it is saved
articles

Unnamed: 0,authors,title,date,year,edition,url,text,quarter,year_quarter
1790,['习近平'],辩证唯物主义是中国共产党人的世界观和方法论,2018-12-31,2019,1.0,http://www.qstheory.cn/dukan/qs/2018-12/31/c_1...,辩证唯物主义是中国共产党人的世界观和方法论习近平2018年5月4日，纪念马克思诞辰200周年...,1,2019 Q1
1799,"['刘雅鸣', '陈聪', '李亚楠', '宋晓东']",兰考：会它千顷澄碧,2019-01-01,2019,1.0,http://www.qstheory.cn/dukan/qs/2019-01/01/c_1...,焦裕禄当年亲手栽下的幼桐已长成大树，人们亲切地叫它“焦桐”。 新华社记者 冯大鹏/摄兰考谷营...,1,2019 Q1
1804,['国家统计局'],经济社会发展统计：改革开放40年辉煌成就（经济篇）,2019-01-01,2019,1.0,http://www.qstheory.cn/dukan/qs/2019-01/01/c_1...,我国国民经济主要指标注：国内生产总值增速按可比价计算，2017年国内生产总值总量不等于各产业...,1,2019 Q1
1803,['《中直党建》评论员'],多敲警钟才能少敲丧钟,2019-01-01,2019,1.0,http://www.qstheory.cn/dukan/qs/2019-01/01/c_1...,中央和国家机关警示教育大会，对各部门开展党风廉政警示教育活动进行了部署。落实好会议精神，把警...,1,2019 Q1
1802,['李君'],学理论是党员领导干部的责任,2019-01-01,2019,1.0,http://www.qstheory.cn/dukan/qs/2019-01/01/c_1...,编辑同志：我是广西南宁的一名基层理论工作者，从事理论工作近20年了。在长期工作实践中，我深深...,1,2019 Q1
...,...,...,...,...,...,...,...,...,...
489,['中共中央党史和文献研究院'],永远铭记毛泽东同志的丰功伟绩和崇高风范,2023-12-16,2023,24.0,http://www.qstheory.cn/dukan/qs/2023-12/16/c_1...,永远铭记毛泽东同志的丰功伟绩和崇高风范中共中央党史和文献研究院今年是中国共产党、中国人民解放...,4,2023 Q4
483,,本期导读,2023-12-16,2023,24.0,http://www.qstheory.cn/dukan/qs/2023-12/16/c_1...,本期发表了习近平总书记的重要文章《在二十届中央机构编制委员会第一次会议上的讲话》。文章强调，...,4,2023 Q4
487,['中央机构编制委员会办公室'],扎实推进新时代新征程机构编制工作高质量发展,2023-12-16,2023,24.0,http://www.qstheory.cn/dukan/qs/2023-12/16/c_1...,扎实推进新时代新征程机构编制工作高质量发展中央机构编制委员会办公室习近平总书记在二十届中央机...,4,2023 Q4
493,['潘岳'],铸牢中华民族共同体意识,2023-12-16,2023,24.0,http://www.qstheory.cn/dukan/qs/2023-12/16/c_1...,铸牢中华民族共同体意识潘 岳党的十八大以来，习近平总书记对中国特色社会主义文化建设提出的一系...,4,2023 Q4


In [12]:
# save the validated articles; index=False leaves the row index out of the CSV
articles.to_csv('validated_articles.csv', index=False)