In [49]:
import pandas as pd

# Load the validated article table and report its dimensions.
articles = pd.read_csv('../process_articles/validated_articles.csv')
print('Articles shape: {}'.format(articles.shape))

Articles shape: (2014, 9)


In [50]:
# Trim the table to the four columns used for the author analysis.
# NOTE: the column drop and the dropna below both mutate `articles` in place, so
# re-running this cell after it has already run makes `total_articles` (and the
# saved CSV) reflect the author-filtered rows instead of the full table.
keep = ['authors', 'title', 'year_quarter', 'url']
articles.drop(columns=articles.columns.difference(keep), inplace=True)
total_articles = articles.shape[0]

# Write the trimmed (not yet author-filtered) table to disk for looking up authors by hand.
articles.to_csv('./author_info/articles.csv')
print('Articles shape: {}'.format(articles.shape))

# Discard rows with no author data and record how many articles remain.
articles.dropna(axis=0, subset=['authors'], inplace=True)
articles_w_author = articles.shape[0]
print('Articles shape: {}'.format(articles.shape))

Articles shape: (2014, 4)
Articles shape: (1714, 4)


I need to begin with the end in mind - what is the research question? 

Questions:
- Who are the most important voices in Qiushi (as measured by frequency - which may not be the best metric)?
- Have the most important voices changed over time? 
- Who are the most important writers on a given topic? (Open issue: how do I tell whether an article is on a given topic? Is it enough for the topic to appear in the title?)

To track frequency over time, maybe have a table with authors as rows, and then two columns for each quarter: one with an article count, and another with the titles for that quarter? 

I want the total article count for each author, and also an articles-per-quarter count for each of them.

In [51]:
# Simple tally: how many articles each author name appears on.
from ast import literal_eval

author_article_count = {}

# Each `authors` cell is a string that literal_eval parses into an iterable of names;
# every name listed on an article adds one to that author's running total.
for index, article in articles.iterrows():
    for author in literal_eval(article['authors']):
        author_article_count[author] = author_article_count.get(author, 0) + 1

author_article_count

{'习近平': 123,
 '《求是》杂志编辑部': 72,
 '王毅': 11,
 '《求是》杂志评论员': 76,
 '国家发展和改革委员会': 9,
 '中共江苏省委': 1,
 '中共深圳市委': 1,
 '铁凝': 6,
 '王可': 1,
 '周昭成': 8,
 '陈聪': 4,
 '聂悄语': 1,
 '黎海华': 10,
 '吴擒虎': 1,
 '梁佩韵': 12,
 '刘骏娇': 1,
 '李飞': 3,
 '姜小薇': 1,
 '李豪杰': 1,
 '王文涛': 4,
 '侯凯': 1,
 '李文章': 2,
 '左中一': 2,
 '曲莹璞': 1,
 '谭瑞松': 1,
 '习近平生态文明思想研究中心': 3,
 '北京市习近平新时代中国特色社会主义思想研究中心': 3,
 '魏天舒': 8,
 '吉文磊': 1,
 '蔡春玲': 7,
 '樊遂桥': 1,
 '申小提': 3,
 '张颐佳': 1,
 '狄英娜': 10,
 '翟汝增': 1,
 '何雨奇': 1,
 '《机关党建研究》评论员': 3,
 '中共中央党校（国家行政学院）校（院）务委员会': 3,
 '中共中央党史和文献研究院院务会理论学习中心组': 4,
 '共青团中央书记处理论学习中心组': 1,
 '中共中国社会科学院党组': 3,
 '倪岳峰': 3,
 '王宁': 1,
 '慎海雄': 10,
 '余剑锋': 2,
 '沈壮海': 3,
 '盛玮': 7,
 '吴晓迪': 4,
 '杜晨薇': 1,
 '陈有勇': 8,
 '王磊': 1,
 '刘忠培': 1,
 '《中国纪检监察》评论员': 5,
 '中共上海市委': 4,
 '景俊海': 4,
 '金壮龙': 1,
 '刘昆': 4,
 '中共全国工商联党组': 2,
 '中共国家统计局党组': 7,
 '徐艳玲': 1,
 '王成果': 1,
 '周璐铭': 6,
 '张春林': 2,
 '旷思思': 9,
 '章丹': 1,
 '侯亚景': 12,
 '咸文静': 2,
 '王志琦': 1,
 '秦九鸿': 1,
 '新华社记者': 38,
 '钧政': 3,
 '庄荣文': 2,
 '中共全国妇联党组': 3,
 '全国人大常委会法制工作委员会': 4,
 '程同顺': 1,
 '贺军科': 3,
 '吴

In [52]:
# number of distinct author names in the tally
len(author_article_count)

955

In [53]:
# look up the article count for one specific author (raises KeyError if the name is not in the tally)
author_article_count["张芯蕊"]

1

In [39]:
# Keep only authors with at least 10 articles, ordered from most to fewest articles.
# The sort is stable, so authors with equal counts stay in their original tally order.
sorted_author_article_count = dict(
    sorted(
        ((author, count) for author, count in author_article_count.items() if count >= 10),
        key=lambda pair: pair[1],
        reverse=True,
    )
)
sorted_author_article_count

{'习近平': 123,
 '《求是》杂志评论员': 76,
 '《求是》杂志编辑部': 72,
 '本刊编辑部': 47,
 '新华社记者': 38,
 '本刊评论员': 19,
 '同心': 14,
 '宁吉喆': 13,
 '梁佩韵': 12,
 '侯亚景': 12,
 '巨力': 12,
 '王毅': 11,
 '黎海华': 10,
 '狄英娜': 10,
 '慎海雄': 10,
 '曲青山': 10,
 '中共中央党史和文献研究院': 10}

In [40]:
# Turn the filtered author counts into a one-column table (one row per author) and export it to Excel.
# NOTE(review): the file is named all_authors.xlsx, but only the entries of
# sorted_author_article_count are written — confirm that is the intent.
df = pd.DataFrame.from_dict(
    sorted_author_article_count,
    orient='index',
    columns=['article_count'],
)
df.to_excel('./author_info/all_authors.xlsx')
df

Unnamed: 0,article_count
习近平,123
《求是》杂志评论员,76
《求是》杂志编辑部,72
本刊编辑部,47
新华社记者,38
本刊评论员,19
同心,14
宁吉喆,13
梁佩韵,12
侯亚景,12


In [41]:
# number of authors currently held in the filtered dict
len(sorted_author_article_count)

17

In [43]:
# Share of articles that list at least one author from sorted_author_article_count.
# Each article is counted once: an article can list several authors, so summing the
# per-author counts would count a co-authored article once per listed author and
# overstate the proportion.
frequent_authors = set(sorted_author_article_count)
articles_by_frequent_authors = sum(
    any(author in frequent_authors for author in literal_eval(author_list))
    for author_list in articles['authors']
)
print('Proportion of articles with author identified authored by 10+ article authors: {:.3f}'.format(articles_by_frequent_authors / articles_w_author))
print('Proportion of all articles authored by 10+ article authors: {:.3f}'.format(articles_by_frequent_authors / total_articles))


Proportion of articles with author identified authored by 10+ article authors: 0.291
Proportion of all articles authored by 10+ article authors: 0.248


In [44]:
# Strip organizations so that only individuals remain: any name longer than 4 characters is removed.
# NOTE(review): name length is only a heuristic — confirm that no individual has a name over
# 4 characters and that the short names kept are all individuals.
for key in tuple(sorted_author_article_count):  # snapshot the keys so the dict can shrink while looping
    if len(key) <= 4:
        continue
    del sorted_author_article_count[key]

sorted_author_article_count

{'习近平': 123,
 '同心': 14,
 '宁吉喆': 13,
 '梁佩韵': 12,
 '侯亚景': 12,
 '巨力': 12,
 '王毅': 11,
 '黎海华': 10,
 '狄英娜': 10,
 '慎海雄': 10,
 '曲青山': 10}

In [45]:
# Export the remaining entries of sorted_author_article_count (one row per author) to Excel.
df = pd.DataFrame.from_dict(
    sorted_author_article_count,
    orient='index',
    columns=['article_count'],
)
df.to_excel('./author_info/individual_authors.xlsx')

In [46]:
# Share of articles that list at least one author still in sorted_author_article_count.
# Each article is counted once: an article can list several authors, so summing the
# per-author counts would count a co-authored article once per listed author and
# overstate the proportion.
frequent_individuals = set(sorted_author_article_count)
articles_by_frequent_individuals = sum(
    any(author in frequent_individuals for author in literal_eval(author_list))
    for author_list in articles['authors']
)
print('Proportion of articles with author identified authored by individuals with 10+ articles: {:.3f}'.format(articles_by_frequent_individuals / articles_w_author))
print('Proportion of all articles authored by individuals with 10+ articles: {:.3f}'.format(articles_by_frequent_individuals / total_articles))


Proportion of articles with author identified authored by individuals with 10+ articles: 0.138
Proportion of all articles authored by individuals with 10+ articles: 0.118
