In [8]:
import pandas as pd

In [9]:
# Load the scraped articles from the CSV written by the scraping step and
# report how many rows were read.
articles = pd.read_csv('../get_articles/all_articles.csv')
print(f"Total number of articles scraped: {len(articles)}")

Total number of articles scraped: 2181


In [10]:
# Sanity check: collect the rows whose 'year' is null and show how many there are.
missing_year = articles[articles['year'].isnull()]
# Optional export of those rows for manual inspection (left disabled):
# missing_year.to_csv('missing_year.csv')
len(missing_year)

0

In [11]:
def determine_edition_quarter(edition_number):
    """Map an edition number to a quarter number (1-4).

    Editions 1-6 map to 1, 7-12 to 2 and 13-18 to 3. Every other value
    falls through to 4: that covers 19 and above, but also anything that
    matches none of the ranges (0, negatives, non-whole numbers, NaN).
    """
    quarter_by_editions = (
        (range(1, 7), 1),
        (range(7, 13), 2),
        (range(13, 19), 3),
    )
    for editions, quarter in quarter_by_editions:
        if edition_number in editions:
            return quarter
    # Catch-all: no validation is performed on the edition number.
    return 4
    
def combine_year_quarter(row):
    """Return the "<year> Q<quarter>" label for one row.

    `row` only needs to support lookup by the keys "year" and "quarter";
    both values are converted with str() exactly as they are stored.
    """
    year, quarter = row["year"], row["quarter"]
    return f"{year!s} Q{quarter!s}"

# Keep only the rows that have a publication year.
articles = articles.dropna(subset=["year"])
print(f"Number of articles after dropping missing years: {len(articles)}")

# Derive the quarter from the edition number, cast the year to int, and only
# then build the year/quarter label, so the label is computed from the frame
# as it looks after the cast.
articles = (
    articles
    .assign(quarter=lambda frame: frame["edition"].map(determine_edition_quarter))
    .astype({"year": int})
    .assign(year_quarter=lambda frame: frame.apply(combine_year_quarter, axis=1))
)

Number of articles after dropping missing years: 2181


In [12]:
# Export the rows with no text so they can be checked by hand: they are
# expected to be statistical reports, not articles that failed to scrape.
textless_articles = articles[articles['text'].isnull()]
textless_articles.to_csv('textless_articles.csv')

# Rebind instead of dropping in place, so this step follows the same
# "articles = <filtered articles>" pattern as the other filtering cells and
# never mutates a frame that another name may still refer to.
articles = articles.dropna(subset=['text'])
print(f'Number of articles after dropping those with no text: {len(articles)}')

Number of articles after dropping those with no text: 2058


In [13]:
# Find top-level pages that duplicate the text of other articles: they have
# three or more articles from the same edition copied inside of them. They
# are spotted by the marker '编者按' ("editor's note"), matched as a literal
# substring rather than as a regular expression.
suspected_toplevel = articles[articles['text'].str.contains('编者按', regex=False)]
suspected_toplevel['url'].to_csv('suspected_toplevel.csv')
toplevel_links = list(suspected_toplevel['url'])
print(f'Suspected top-level links: {len(toplevel_links)}')

# Remove the few links that are actually legit (determined by looking at all 16).
legit_links = ['http://www.qstheory.cn/dukan/qs/2019-01/16/c_1123987212.htm',
               'http://www.qstheory.cn/dukan/qs/2019-01/16/c_1123987342.htm',
               'http://www.qstheory.cn/dukan/qs/2019-07/01/c_1124690404.htm']
# Filter instead of calling list.remove(): remove() deletes only the first
# occurrence (a duplicated legit URL would stay in the list and the article
# would still be dropped later) and raises ValueError when a legit link is
# not among the suspects.
toplevel_links = [link for link in toplevel_links if link not in legit_links]
print(f'Top-level links: {len(toplevel_links)}')

Suspected top-level links: 16
Top-level links: 13


In [14]:
# Keep every article whose URL is not in the list of top-level links.
articles = articles.loc[~articles['url'].isin(toplevel_links)]
print(f'Number of articles after dropping top-level entries: {len(articles)}')

Number of articles after dropping top-level entries: 2045


In [15]:
# For consistency, exclude the 2024 articles: that year does not have a
# complete quarter yet.
articles = articles.loc[articles['year'] != 2024]
print(f'Number of articles after dropping 2024 entries: {len(articles)}')

Number of articles after dropping 2024 entries: 2014


In [16]:
# Write the filtered articles to disk, without the DataFrame index column.
articles.to_csv('validated_articles.csv', index=False)