In [1]:
import sys
import platform
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urlencode
import urllib3
import string
import json

#### https://csdl-api.computer.org/api/v1/graphql (GraphiQL 주소)

##### Periodicals로 idPrefix, title, yearFrom, yearTo 얻기

In [2]:
# Ask the CSDL GraphQL endpoint for every journal-type periodical, requesting
# only the fields used below (idPrefix, title, yearFrom, yearTo).
resp = requests.post(
    "https://csdl-api.computer.org/api/v1/graphql",
    json={
        "query":'query { periodicals(pubType:"journal") { idPrefix title yearFrom yearTo } }'
    },
    # requests waits indefinitely unless a timeout is given; bound the wait so
    # a stalled connection cannot hang the cell forever.
    timeout=30,
)
# Surface HTTP 4xx/5xx as an exception instead of trying to parse an error
# page as JSON.
resp.raise_for_status()
body = resp.json()

In [3]:
# Pretty-print the decoded GraphQL response (4-space indent) to inspect its structure.
print(json.dumps(body, indent=4))

{
    "data": {
        "periodicals": [
            {
                "idPrefix": "cq",
                "title": "Colloquium",
                "yearFrom": 2017,
                "yearTo": 2019
            },
            {
                "idPrefix": "dc",
                "title": "IEEE Journal on Exploratory Solid-State Computational Devices and Circuits",
                "yearFrom": 2015,
                "yearTo": 2019
            },
            {
                "idPrefix": "ta",
                "title": "IEEE Transactions on Affective Computing",
                "yearFrom": 2010,
                "yearTo": 2019
            },
            {
                "idPrefix": "bd",
                "title": "IEEE Transactions on Big Data",
                "yearFrom": 2015,
                "yearTo": 2019
            },
            {
                "idPrefix": "cc",
                "title": "IEEE Transactions on Cloud Computing",
                "yearFrom": 2013,
                "yearTo": 2019


In [4]:
# Flatten every periodical record of the response into a
# (title, idPrefix, yearFrom, yearTo) tuple, keeping the response order.
journals = [
    (entry['title'], entry['idPrefix'], entry['yearFrom'], entry['yearTo'])
    for entry in body["data"]["periodicals"]
]

In [5]:
# Display the collected (title, idPrefix, yearFrom, yearTo) tuples.
journals

[('Colloquium', 'cq', 2017, 2019),
 ('IEEE Journal on Exploratory Solid-State Computational Devices and Circuits',
  'dc',
  2015,
  2019),
 ('IEEE Transactions on Affective Computing', 'ta', 2010, 2019),
 ('IEEE Transactions on Big Data', 'bd', 2015, 2019),
 ('IEEE Transactions on Cloud Computing', 'cc', 2013, 2019),
 ('IEEE Transactions on Computational Intelligence and AI in Games',
  'ci',
  2009,
  2017),
 ('IEEE Transactions on Computers', 'tc', 1968, 2019),
 ('IEEE Transactions on Dependable and Secure Computing', 'tq', 2004, 2019),
 ('IEEE Transactions on Emerging Topics in Computing', 'ec', 2013, 2019),
 ('IEEE Transactions on Haptics', 'th', 2008, 2017),
 ('IEEE Transactions on Knowledge & Data Engineering', 'tk', 1989, 2019),
 ('IEEE Transactions on Learning Technologies', 'lt', 2008, 2017),
 ('IEEE Transactions on Mobile Computing', 'tm', 2002, 2019),
 ('IEEE Transactions on Multi-Scale Computing Systems', 'mc', 2015, 2018),
 ('IEEE Transactions on Network Science and Engin

In [6]:
# Column names, in the same order as the fields of each tuple in `journals`.
labels = ["journalName", "journalId", "yearFrom", "yearTo"]
# Build a DataFrame with one row per tuple in `journals`.
journals_meta_df = pd.DataFrame.from_records(journals, columns=labels)

In [7]:
# Display the journal metadata table.
journals_meta_df

Unnamed: 0,journalName,journalId,yearFrom,yearTo
0,Colloquium,cq,2017,2019
1,IEEE Journal on Exploratory Solid-State Comput...,dc,2015,2019
2,IEEE Transactions on Affective Computing,ta,2010,2019
3,IEEE Transactions on Big Data,bd,2015,2019
4,IEEE Transactions on Cloud Computing,cc,2013,2019
5,IEEE Transactions on Computational Intelligenc...,ci,2009,2017
6,IEEE Transactions on Computers,tc,1968,2019
7,IEEE Transactions on Dependable and Secure Com...,tq,2004,2019
8,IEEE Transactions on Emerging Topics in Computing,ec,2013,2019
9,IEEE Transactions on Haptics,th,2008,2017


##### PeriodicalIssues로 year, volume, issueNum 얻기

In [8]:
# Fetch the periodicalIssues rows of every journal and stack them into
# `journals_df` with the columns journalId, year, issueNum, volume.
#
# The per-journal frames are collected in a list and concatenated once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame inside
# a loop re-copies all previous rows on every iteration.
labels = ["journalId", "year", "issueNum", "volume"]
issue_frames = []

for i, journal in enumerate(journals):
    journal_id = journal[1]  # second tuple field is the idPrefix
    resp = requests.post(
        "https://csdl-api.computer.org/api/v1/graphql",
        json={
            "variables":{"idPrefix":"%s" % (journal_id,)},
            "query":'query ($idPrefix : String!) { periodicalIssues(idPrefix : $idPrefix) { label issueNum volume year } }'
        },
        # requests has no default timeout; bound the wait per request.
        timeout=30,
    )
    # Stop on HTTP errors rather than parsing an error page as JSON.
    resp.raise_for_status()
    body = resp.json()

    # `columns=labels` keeps only the listed fields; "journalId" is not part of
    # the response, so it is filled in from the journal being processed.
    temp_df = pd.DataFrame(body['data']['periodicalIssues'], columns=labels)
    temp_df['journalId'] = journal_id
    issue_frames.append(temp_df)

    # TODO: store year, volume, issueNum per journal
    print("Progress : %d of %d (%s)" % (i+1, len(journals), journal_id))

# pd.concat raises on an empty list, so fall back to an empty frame that still
# carries the expected columns. The per-journal index (0..n-1) is preserved,
# exactly as repeated append() calls did.
journals_df = pd.concat(issue_frames) if issue_frames else pd.DataFrame(columns=labels)

Progress : 1 of 26 (cq)
Progress : 2 of 26 (dc)
Progress : 3 of 26 (ta)
Progress : 4 of 26 (bd)
Progress : 5 of 26 (cc)
Progress : 6 of 26 (ci)
Progress : 7 of 26 (tc)
Progress : 8 of 26 (tq)
Progress : 9 of 26 (ec)
Progress : 10 of 26 (th)
Progress : 11 of 26 (tk)
Progress : 12 of 26 (lt)
Progress : 13 of 26 (tm)
Progress : 14 of 26 (mc)
Progress : 15 of 26 (tn)
Progress : 16 of 26 (td)
Progress : 17 of 26 (tp)
Progress : 18 of 26 (sc)
Progress : 19 of 26 (ts)
Progress : 20 of 26 (su)
Progress : 21 of 26 (si)
Progress : 22 of 26 (tg)
Progress : 23 of 26 (tb)
Progress : 24 of 26 (nt)
Progress : 25 of 26 (ca)
Progress : 26 of 26 (lc)


In [9]:
# Display the accumulated periodicalIssues rows for all journals.
journals_df

Unnamed: 0,journalId,year,issueNum,volume
0,cq,2017,01,1
1,cq,2017,02,1
2,cq,2017,03,1
3,cq,2017,04,1
4,cq,2017,05,1
5,cq,2017,06,1
6,cq,2017,07,1
7,cq,2017,08,1
8,cq,2017,09,1
9,cq,2017,10,1


In [10]:
# Group the issue rows by journal so each journal's rows can be fetched with get_group().
journals_groupbyID = journals_df.groupby('journalId')

In [11]:
# Inspect the rows belonging to a single journal (journalId 'cq').
journals_groupbyID.get_group('cq')

Unnamed: 0,journalId,year,issueNum,volume
0,cq,2017,1,1
1,cq,2017,2,1
2,cq,2017,3,1
3,cq,2017,4,1
4,cq,2017,5,1
5,cq,2017,6,1
6,cq,2017,7,1
7,cq,2017,8,1
8,cq,2017,9,1
9,cq,2017,10,1


##### articles과 param (idPrefix, year, issueNum)을 통해 article 크롤링

In [12]:

def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while showing a notebook progress widget.

    A text label and a progress bar (ipywidgets) are displayed once, then
    refreshed on the first item and on every ``every``-th item after that.
    Because this is a generator, nothing happens until iteration starts.

    Parameters
    ----------
    sequence : iterable
        Items to pass through unchanged.
    every : int, optional
        Refresh interval in items. When the length is known and ``every`` is
        omitted, it is 1 for up to 200 items and ``int(size / 200)`` otherwise.
        Must be given when the length cannot be determined.
    size : int, optional
        Total number of items; defaults to ``len(sequence)`` when available.
    name : str
        Prefix used in the label text.

    Yields
    ------
    Each element of ``sequence``, in order.

    Raises
    ------
    AssertionError
        On first iteration, if the length is unknown and ``every`` is None.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Work out the total; objects without len() are treated as open-ended.
    length_unknown = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            length_unknown = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if length_unknown:
        # No total to measure against: show a full bar in the 'info' style.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    display(VBox(children=[label, progress]))

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            # Refresh on the very first item, then once per `every` items.
            refresh_due = index == 1 or index % every == 0
            if refresh_due and length_unknown:
                label.value = '{name}: {index} / ?'.format(name=name, index=index)
            elif refresh_due:
                progress.value = index
                label.value = u'{name}: {index} / {size}'.format(
                    name=name, index=index, size=size
                )
            yield record
    except:
        # Whatever is raised inside the loop marks the bar as failed, then
        # propagates unchanged.
        progress.bar_style = 'danger'
        raise
    else:
        # Normal exhaustion: mark success and show the final count
        # ('?' when nothing was yielded).
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(name=name, index=str(index or '?'))

In [13]:
# Crawl the articles of every (journalId, year, issueNum) row and stack them
# into `all_articles_df`.
#
# One frame per issue is collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all previous rows on every iteration.
labels = ['idPrefix', 'id', 'pubDate', 'keywords', 'title', 'abstract']
article_frames = []

for key in log_progress(journals_groupbyID.groups.keys()):
    temp_df = journals_groupbyID.get_group(key)

    for _, row in temp_df.iterrows():
        resp = requests.post(
            "https://csdl-api.computer.org/api/v1/graphql",
            json={
                "variables":{"idPrefix":row['journalId'], "year":row['year'], "issueNum":row['issueNum']},
                "query":'query ($idPrefix : String!, $year : String!, $issueNum : String!) { articles(idPrefix: $idPrefix, year: $year, issueNum: $issueNum) { idPrefix id year pubDate keywords title abstract authors { affiliation fullName givenName surname } } }'
            },
            # requests has no default timeout; bound the wait per request.
            timeout=30,
        )
        # Stop on HTTP errors rather than parsing an error page as JSON.
        resp.raise_for_status()
        body = resp.json()
        # `columns=labels` keeps only the listed fields; the queried `year`
        # and `authors` fields are dropped here.
        article_frames.append(pd.DataFrame.from_records(body['data']['articles'], columns=labels))

# pd.concat raises on an empty list, so fall back to an empty frame that still
# carries the expected columns. The per-issue index (0..n-1) is preserved,
# exactly as repeated append() calls did, so index values repeat across issues.
all_articles_df = pd.concat(article_frames) if article_frames else pd.DataFrame(columns=labels)

VBox(children=(HTML(value=''), IntProgress(value=0, max=26)))

In [14]:
# Display the crawled articles table.
all_articles_df

Unnamed: 0,idPrefix,id,pubDate,keywords,title,abstract
0,bd,13rRUwwslvh,2015-01-01,[],Welcome to the IEEE Transactions on Big Data,Presents an editorial introducting the inaugur...
1,bd,13rRUygT7c8,2015-01-01,[],Introduction to the IEEE Transactions on Big Data,Presents an introduction to the inaugural issu...
2,bd,13rRUwvT9j1,2015-01-01,"[Data Integration, Big Data, Data Mining, Feat...",Methodologies for Cross-Domain Data Fusion: An...,Traditional data mining usually deals with dat...
3,bd,13rRUxBJhxj,2015-01-01,"[Semantics, Internet, Electronic Publishing, S...",Embracing Information Explosion without Chokin...,The explosive popularity of microblogging serv...
0,bd,13rRUxAStUe,2015-07-01,"[Data Models, Computational Modeling, Big Data...",Petuum: A New Platform for Distributed Machine...,What is a systematic way to efficiently apply ...
1,bd,13rRUxASu2L,2015-07-01,"[Distributed Databases, Base Stations, Data Pr...",SMC: A Practical Schema for Privacy-Preserved ...,Data collection is required to be safe and eff...
0,bd,13rRUxAASMi,1900-01-01,[],Guest Editorial: Big Media Data: Understanding...,
1,bd,13rRUwbJCYJ,1900-01-01,"[Binary Codes, Error Correction Codes, Error C...",Code Consistent Hashing Based on Information-T...,Learning based hashing techniques have attract...
2,bd,13rRUzphDs2,1900-01-01,"[Visualization, Search Engines, Google, Big Da...",Exploration of Image Search Results Quality As...,Image retrieval plays an increasingly importan...
3,bd,13rRUNvyan1,1900-01-01,"[Training, Semantics, Machine Learning, Visual...",Weakly Semi-Supervised Deep Learning for Multi...,"In this paper, we study leveraging both weakly..."


In [18]:
# Persist the crawled articles as a pickle file (relative path, so it lands in
# the current working directory).
all_articles_df.to_pickle('all_articles_df.p')

In [38]:
# Reload the articles from the pickle file, replacing the in-memory frame.
# NOTE(review): unpickling can execute arbitrary code; only load a pickle file
# that this notebook (or another trusted source) produced.
all_articles_df = pd.read_pickle('all_articles_df.p')