# AAAI 2019 論文發表單位排行榜

## 各研究單位之論文發表數量排行榜

### 取出論文作者資訊
- 每個csv檔案中的第2行
- 多位作者以逗點隔開
- 作者與組織名稱以@符號隔開

### 匯集單位清單
- 每篇論文的各單位只計算1次，不論有多少位作者

### 統計與排序
- 統計各單位出現次數
- 依照出現次數排序
- 列出前50名

In [1]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from enum import Enum

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
#nltk.download('wordnet')
porter_stemmer = PorterStemmer()

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

### 自定義 data types and functions

In [2]:
class ContentType(Enum):
    """Kinds of per-paper content that ``get_contents`` can extract.

    ``get_contents`` maps each member to a fixed line of a paper's data file.
    """

    TIT = 'title'     # read from the first line of the file
    ABS = 'abstract'  # read from the fourth line of the file
    AUT = 'author'    # read from the second line of the file
    SEC = 'section'   # read from the third line of the file

In [3]:
def get_contents(content_type, dataset_path='../dataset', encoding='utf-8'):
    """Collect one line of a given kind from every file in a directory.

    Every regular file directly inside ``dataset_path`` is read with a fixed
    line layout: line 1 is the title, line 2 the authors, line 3 the section
    and line 4 the abstract.

    Args:
        content_type: ``ContentType`` member selecting which line to read.
            Any value other than ``AUT``, ``SEC`` or ``ABS`` selects the
            title line.
        dataset_path: Directory to scan (not recursive).
        encoding: Text encoding used to open the files. It is passed
            explicitly so the result does not depend on the platform's
            default encoding.

    Returns:
        A list with one stripped string per regular file, ordered by file
        name. A file that is too short to contain the requested line
        contributes ``''`` so that the lists returned for different content
        types stay index-aligned.
    """
    line_index_by_type = {
        ContentType.AUT: 1,
        ContentType.SEC: 2,
        ContentType.ABS: 3,
    }
    # Anything not listed above (including ContentType.TIT) reads the title.
    line_index = line_index_by_type.get(content_type, 0)

    all_contents = []
    # os.listdir() yields entries in arbitrary order; sort them so repeated
    # runs produce the same list.
    for file_name in sorted(os.listdir(dataset_path)):
        file_path = os.path.join(dataset_path, file_name)
        if not os.path.isfile(file_path):
            # Reached for sub-directories and other non-regular entries.
            print(file_path + ' does not exist.')
            continue

        with open(file_path, encoding=encoding) as f:
            lines = f.readlines()

        if line_index < len(lines):
            all_contents.append(lines[line_index].strip())
        else:
            # Report the short file instead of failing with IndexError.
            print(file_path + ' has no line ' + str(line_index + 1) + '.')
            all_contents.append('')
    return all_contents

In [4]:
def get_all_titles():
    """Return the title line of every paper file, via ``get_contents``."""
    return get_contents(ContentType.TIT)

In [5]:
def get_all_authors():
    """Return the author line of every paper file, via ``get_contents``."""
    return get_contents(ContentType.AUT)

In [6]:
def get_all_abstracts():
    """Return the abstract line of every paper file, via ``get_contents``."""
    return get_contents(ContentType.ABS)

In [7]:
def get_unique_author_org(data):
    """Return the distinct organizations named in one paper's author line.

    The line is split on ``,`` into author entries, and each entry is
    expected to look like ``name@organization``. Entries without ``@`` or
    with a blank organization are ignored.

    Args:
        data: The author line of a single paper.

    Returns:
        A list of organization names with surrounding whitespace removed,
        each appearing once, in order of first appearance.
    """
    # NOTE(review): splitting on ',' cuts an organization name that itself
    # contains a comma at that comma - confirm the data never has one.
    orgs = []
    for author in data.split(','):
        if '@' not in author:
            continue
        # Strip before testing for emptiness so that "name@ " does not
        # produce an empty-string organization.
        org = author.split('@')[1].strip()
        if org:
            orgs.append(org)

    # dict.fromkeys() removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(orgs))

### 進行次數統計與排序

In [8]:
# Count, for every organization, the number of papers it appears on.
author_org_list = []

all_author_data = get_all_authors()
for data in all_author_data:
    # Each organization counts once per paper, however many of its authors
    # are listed on that paper.
    unique_author_org = get_unique_author_org(data)
    # extend() grows the list in place instead of copying it on every paper.
    author_org_list.extend(unique_author_org)

author_org_counter = Counter(author_org_list)
print('共',len(author_org_counter),'不重複的單位')

# The 50 organizations with the most papers, as (organization, count) pairs.
author_org_top50 = author_org_counter.most_common(50)

共 947 不重複的單位


In [10]:
# Tabulate the (organization, paper count) pairs of the top-50 ranking; the
# column labels mean "organization" and "number of papers".
df = pd.DataFrame(data=author_org_top50,columns=['單位','論文篇數'])
# Bare expression: shown as the cell's output when run in a notebook.
df

Unnamed: 0,單位,論文篇數
0,Tsinghua University,58
1,Chinese Academy of Sciences,54
2,University of California,48
3,Peking University,44
4,Carnegie Mellon University,42
5,IBM Research,32
6,Beihang University,31
7,Microsoft Research,31
8,University of Science and Technology of China,29
9,National University of Singapore,27


### 台灣的學校

In [20]:
# Keep every organization whose name contains "taiwan" (case-insensitive),
# paired with its paper count.
taiwan_school = [
    (org_name, paper_count)
    for org_name, paper_count in author_org_counter.items()
    if 'taiwan' in org_name.lower()
]

# Column labels mean "school" and "number of papers".
df2 = pd.DataFrame(taiwan_school, columns=['學校','論文篇數'])
df2

Unnamed: 0,學校,論文篇數
0,National Taiwan University,3
1,National Taiwan University of Science and Tech...,1
2,National Taiwan Normal University,1
