In [144]:
from collections import Counter
from pathlib import Path

import pandas as pd

# NOTE(review): both paths are absolute and machine-specific, so this cell only
# runs on a machine where these exact files exist.
qiushi_csv_path = '/Users/calebharding/Documents/BYU/2023-2024/China_Project/articles/process_articles/process_qiushi_articles/validated_qiushi_articles.csv'
asx_csv_path = '/Users/calebharding/Documents/BYU/2023-2024/China_Project/articles/process_articles/process_asx_articles/validated_asx_articles.csv'

# Load the two validated article tables (Qiushi first, then ASX).
qiushi_articles = pd.read_csv(qiushi_csv_path)
asx_articles = pd.read_csv(asx_csv_path)

# Print (rows, columns) of each frame as a quick check that the load worked.
print(f'Qiushi articles shape: {qiushi_articles.shape}')
print(f'ASX articles shape: {asx_articles.shape}')

Qiushi articles shape: (1998, 9)
ASX articles shape: (13361, 8)


In [145]:
# Show the first ASX row to confirm the column layout after loading.
asx_articles.head(1)

Unnamed: 0,title,authors,date,url,text,year,quarter,year_quarter
0,告别国家法一元论,强世功,2019-01-01,https://www.aisixiang.com/data/114287.html,在1995年发表的《秋菊的困惑和三杠爷的悲剧》这篇法律社会学经典论文中，苏力在法学界率先采...,2019,1,2019 Q1


In [146]:
# Show the first Qiushi row to confirm the column layout after loading.
qiushi_articles.head(1)

Unnamed: 0,authors,title,date,year,edition,url,text,quarter,year_quarter
0,习近平,辩证唯物主义是中国共产党人的世界观和方法论,2018-12-31,2019,1.0,http://www.qstheory.cn/dukan/qs/2018-12/31/c_1...,辩证唯物主义是中国共产党人的世界观和方法论习近平2018年5月4日，纪念马克思诞辰200周年...,1,2019 Q1


In [147]:
def get_authorship_counts(df: pd.DataFrame, top_n: int, individuals_only: bool = False) -> pd.DataFrame:
    """Count how many articles each author is credited on and return the top ``top_n``.

    Each non-missing value in ``df['authors']`` is split on single spaces and
    every resulting non-empty name is credited with one article, so an article
    with several listed authors counts once for each of them.

    Parameters
    ----------
    df : pd.DataFrame
        Article table. Only the ``authors`` column is read; each value is
        expected to be a space-separated string of author names (or missing).
    top_n : int
        Number of rows to return after ranking.
    individuals_only : bool, default False
        If True, keep only names shorter than 5 characters before taking the
        top ``top_n``. This is a length heuristic for dropping organizations;
        longer personal names would be dropped as well.

    Returns
    -------
    pd.DataFrame
        Indexed by author name with a single ``Articles`` column, sorted by
        count in descending order. Ties keep the order in which the authors
        first appear in ``df``.
    """
    # Names at or above this length are treated as organizations.
    org_name_min_len = 5

    # Tally one credit per (article, author) pair. Empty tokens come from
    # blank author strings or repeated spaces and are skipped outright, which
    # replaces the old pop("") guarded by a bare except.
    author_article_count = Counter(
        author
        for authors in df['authors'].dropna()
        for author in authors.split(" ")
        if author
    )

    # most_common() sorts stably by count, descending, so tied authors stay in
    # first-seen order.
    ranked_counts = dict(author_article_count.most_common())

    # convert to a dataframe
    author_article_count_df = pd.DataFrame.from_dict(ranked_counts, orient='index', columns=['Articles'])

    # drop organizations if toggled
    if individuals_only:
        is_individual = author_article_count_df.index.map(len) < org_name_min_len
        author_article_count_df = author_article_count_df[is_individual]

    # return the first n rows
    return author_article_count_df.iloc[0:top_n, :]

def fraction_of_whole(top_author_df: pd.DataFrame, articles_df: pd.DataFrame) -> float:
    """Return the summed ``Articles`` counts of the top authors divided by the number of articles.

    Parameters
    ----------
    top_author_df : pd.DataFrame
        Table with an ``Articles`` column holding per-author article counts.
    articles_df : pd.DataFrame
        The full article table; only its row count is used.

    Returns
    -------
    float
        ``top_author_df['Articles'].sum() / len(articles_df)``.

    Notes
    -----
    If the per-author counts credit a co-authored article once per listed
    author, an article shared by two top authors is counted twice in the
    numerator, so the result is an upper bound on the true share.
    """
    article_total = len(articles_df)
    credited_total = top_author_df['Articles'].sum()
    return credited_total / article_total


In [148]:
# Top 10 authors per source, with organizations included.
qiushi_counts = get_authorship_counts(qiushi_articles, top_n=10)
asx_counts = get_authorship_counts(asx_articles, top_n=10)

In [149]:
# Top 10 Qiushi authors after the name-length filter that drops organizations.
qiushi_ind_counts = get_authorship_counts(qiushi_articles, top_n=10, individuals_only=True)

In [150]:
# Create the output folder first: to_excel does not create missing directories,
# so on a fresh checkout this cell would otherwise fail.
author_info_dir = Path('./author_info')
author_info_dir.mkdir(parents=True, exist_ok=True)

# Write each ranking to its own workbook (same file names as before).
qiushi_counts.to_excel(author_info_dir / 'qiushi_all_authors.xlsx')
asx_counts.to_excel(author_info_dir / 'asx_all_authors.xlsx')
qiushi_ind_counts.to_excel(author_info_dir / 'qiushi_ind_authors.xlsx')


In [151]:
# Compute the three shares up front, then print them in a fixed order.
asx_top_share = fraction_of_whole(asx_counts, asx_articles)
qiushi_top_share = fraction_of_whole(qiushi_counts, qiushi_articles)
qiushi_ind_top_share = fraction_of_whole(qiushi_ind_counts, qiushi_articles)

# no difference for ASX, they are all individuals
print('Prop. of ASX articles written by top authors {:.3f}'.format(asx_top_share))

print('Prop. of Qiushi articles written by all top authors {:.3f}'.format(qiushi_top_share))
print('Prop. of Qiushi articles written by top individual authors {:.3f}'.format(qiushi_ind_top_share))


Prop. of ASX articles written by top authors 0.125
Prop. of Qiushi articles written by all top authors 0.212
Prop. of Qiushi articles written by top individual authors 0.112
