In [24]:
import pandas as pd

# Read the scraped aisixiang articles into a DataFrame and show the row count.
articles = pd.read_csv(
    '../../scrape_articles/get_aisixiang_articles/all_asx_articles.csv'
)
articles.shape[0]

56245

In [25]:
# Peek at the first row to see which columns the file provides.
articles.head(n=1)

Unnamed: 0,title,authors,date,url,text
0,计划生育制度变革与法治化,湛中乐 苏宇,2024-03-09,https://www.aisixiang.com/data/149731.html,【摘要】中国计划生育事业已经走过了从纯粹的政策主导到形式法治的历程，而实质法治进程亦已开始...


In [26]:
# Count the missing values in each column.
articles.isna().sum(axis=0)

title        2
authors    950
date         0
url          0
text       413
dtype: int64

In [27]:
# Only a tiny fraction of the articles have no text; the gaps look reasonably
# random (or those pages may simply be graphics), so discard the rows whose
# 'text' value is missing.
articles = articles.dropna(axis='index', subset=['text'])
articles.shape[0]

55832

In [28]:
# Count the distinct URLs; a number below the row count means some articles
# appear more than once.
articles['url'].nunique(dropna=False)

55412

In [29]:
# Keep only the first row seen for each URL. drop_duplicates does this in one
# vectorized pass, replacing a Python loop that dropped rows from the frame in
# place while iterating over it (every such drop rebuilds the whole frame, and
# drop-by-label would remove extra rows if the index were ever non-unique).
articles = articles.drop_duplicates(subset='url', keep='first')

print(f'Number of articles after dropping duplicate links: {len(articles)}')

Number of articles after dropping duplicate links: 55412


In [30]:
# Order the rows by their 'date' value, ascending.
articles = articles.sort_values(by='date')

In [31]:
# Derive additional date columns, starting with the year: the first four
# characters of each date string, converted to an integer.
articles['year'] = articles['date'].str[:4].astype('int64')

def get_quarter(full_date):
    """Return the quarter number (1-4) for a date string such as '2001-04-01'.

    The string must split on '-' into exactly three parts (otherwise the
    unpacking raises ValueError); only the middle, month part is used.
    Months up to 3 map to quarter 1, up to 6 to quarter 2, up to 9 to
    quarter 3, and anything larger to quarter 4.
    """
    _, month_text, _ = full_date.split('-')
    month_number = int(month_text)
    # Check the closing month of quarters 1-3 in order; the first boundary
    # the month does not exceed decides the quarter.
    for quarter, last_month in enumerate((3, 6, 9), start=1):
        if month_number <= last_month:
            return quarter
    return 4
    
# Quarter (1-4) of each article, computed from its date string.
articles['quarter'] = articles['date'].map(get_quarter)

def combine_year_quarter(row):
    """Build a label such as '2001 Q2' from a row's year and quarter.

    `row` is anything indexable by the keys "year" and "quarter"; both
    values are converted with str() and joined with ' Q' between them.
    """
    return f'{row["year"]!s} Q{row["quarter"]!s}'

# Build the '<year> Q<quarter>' label with vectorized string concatenation.
# This yields the same labels as applying combine_year_quarter to every row,
# without a row-wise apply(axis=1) that constructs a Series for each row
# (article text included) just to read two integer columns.
articles['year_quarter'] = (
    articles['year'].astype(str) + ' Q' + articles['quarter'].astype(str)
)

In [32]:
# Check the first row, which now carries the derived date columns.
articles.head(n=1)

Unnamed: 0,title,authors,date,url,text,year,quarter,year_quarter
26638,耐心，自信心，爱心与压力,台湾问题,2001-04-01,https://www.aisixiang.com/data/1397.html,现在台湾新领导人面临内外交困之势，台湾问题解决的走向出现了一些新的迹象，值得人们用心思考...,2001,2,2001 Q2


In [33]:
# limit it to 2019-2023, so we can compare to Qiushi
articles = articles[(articles['year'] >= 2019) & (articles['year'] <= 2023)]
len(articles)

13361

In [22]:
# Save the cleaned articles; the DataFrame index is left out of the file.
articles.to_csv(path_or_buf='validated_asx_articles.csv', index=False)

In [23]:
import pandas as pd

# Create a 100-row subset for use in testing. A fixed random_state makes the
# draw reproducible, so re-running this cell writes the same rows every time.
articles = pd.read_csv('validated_asx_articles.csv')
articles = articles.sample(100, random_state=42)
# The index (each sampled row's position in the validated file) is written as
# the first, unnamed column of the sample file.
articles.to_csv('asx_sample.csv')