In [1]:
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup
import re
import time
import json
import random
import urllib

## Database Information 

Database Name: Paper_metadata.json

Data Source: sciencemag.org

Metadata: Title, Publication Year, Author, Category, Institution, DOI, Subject, Content (Abstract, Citation, Full Content)

### JSON Structure

{"maximumIndex":metadata, "data":{content}}

content (a dictionary; its keys are shown below):

{'type',
'title',
'author',
'authorInstitution',
'publisher',
'date',
'doi',
'pdfLink',
'textLink',
'text',
'abstract',
'introduction',
'discussion'}


## Webpage Structure

The archives are listed at science.sciencemag.org/content/by/year

Papers can be browsed by year and by week

<img src='../pic/archives_by_year.png' style="width: 600px;"/>

Clicking on a week gives the links to that week's papers

<img src='../pic/contents_by_week.png' style="width: 600px;"/>

Analyzing those links then gives us the metadata and the documents

In [5]:
# Site root; the relative links collected by the scraping functions are appended to it
basic_url = 'http://science.sciencemag.org'

In [37]:
# too many urls here, so scrape the urls first
def scrapingURL(range1=2010,range2=2017):
    """Collect the "Full Text" link of every paper, grouped by year and week.

    For each year from ``range1`` to ``range2`` (both inclusive) the page
    ``basic_url + '/content/by/year/<year>'`` is fetched. Every issue link
    found on it (``a.highwire-cite-linked-title`` inside ``div.pane-content``)
    is opened in turn, and the ``href`` of each anchor titled ``Full Text`` on
    the issue page is recorded.

    Parameters
    ----------
    range1 : int
        First year to scrape.
    range2 : int
        Last year to scrape. If it is smaller than ``range1`` nothing is
        fetched and an empty dict is returned.

    Returns
    -------
    dict
        ``{year: {week: [href, ...]}}`` where ``week`` is the first six
        characters of the issue link's text.

    Raises
    ------
    AttributeError
        If a year page has no ``div`` with class ``pane-content``.

    Notes
    -----
    Reads the module-level ``basic_url``. Pauses 5-10 seconds after each
    issue page and prints a progress line per issue.
    """
    urlList = {}
    for year in range(range1, range2 + 1):
        yearUrl = basic_url + '/content/by/year/' + str(year)
        urlList[year] = {}

        # Parse inside the `with` so the HTTP response is closed right away
        # instead of lingering until garbage collection.
        with urlopen(yearUrl) as response:
            yearPage = BeautifulSoup(response, 'html5lib')

        pane = yearPage.find('div', {'class': 'pane-content'})
        issueLinks = pane.findAll('a', {'class': 'highwire-cite-linked-title'})

        for issueLink in issueLinks:
            # e.g. the leading date part of the link text; computed once and reused
            week = issueLink.get_text()[0:6]

            with urlopen(basic_url + issueLink.get('href')) as response:
                issuePage = BeautifulSoup(response, 'html5lib')

            # href of every "Full Text" anchor on the issue page
            urlList[year][week] = [
                anchor.get('href')
                for anchor in issuePage.findAll('a', {'title': 'Full Text'})
            ]

            # randomized pause between issue pages
            time.sleep(5 * (1 + random.random()))
            print("finished: " + str(year) + ' ' + week)
    return urlList

In [38]:
# Collect the paper links for the 2016 archive only
urls = scrapingURL(range1=2016, range2=2016)

finished: 2016 Jan 01
finished: 2016 Jan 08
finished: 2016 Jan 15
finished: 2016 Jan 22
finished: 2016 Jan 29
finished: 2016 Feb 05
finished: 2016 Feb 12
finished: 2016 Feb 19
finished: 2016 Feb 26
finished: 2016 Mar 04
finished: 2016 Mar 11
finished: 2016 Mar 18
finished: 2016 Mar 25
finished: 2016 Apr 01
finished: 2016 Apr 08
finished: 2016 Apr 15
finished: 2016 Apr 22
finished: 2016 Apr 29
finished: 2016 May 06
finished: 2016 May 13
finished: 2016 May 20
finished: 2016 May 27
finished: 2016 Jun 03
finished: 2016 Jun 10
finished: 2016 Jun 17
finished: 2016 Jun 24
finished: 2016 Jul 01
finished: 2016 Jul 08
finished: 2016 Jul 15
finished: 2016 Jul 22
finished: 2016 Jul 29
finished: 2016 Aug 05
finished: 2016 Aug 12
finished: 2016 Aug 19
finished: 2016 Aug 26
finished: 2016 Sep 02
finished: 2016 Sep 09
finished: 2016 Sep 16
finished: 2016 Sep 23
finished: 2016 Sep 30
finished: 2016 Oct 07
finished: 2016 Oct 14
finished: 2016 Oct 21
finished: 2016 Oct 28
finished: 2016 Nov 04
finished: 

In [39]:
def scraping(basic_url,urlList,year,typeList=['Editorial','Research Article','Review Article','Letter','Book Review'],maxPerWeek=2):
    """Download and parse the articles of one year, keeping selected types.

    Parameters
    ----------
    basic_url : str
        Prefix prepended to every link in ``urlList``.
    urlList : dict
        ``{year: {week: [link, ...]}}`` as returned by ``scrapingURL``.
    year
        Key of ``urlList`` selecting which year to process.
    typeList : list of str
        Article types to keep; a page is kept when the ``content`` of its
        ``citation_article_type`` meta tag is in this list. The list is only
        read, never modified.
    maxPerWeek : int or None
        At most this many links per week are fetched (default 2). ``None``
        fetches every link of the week.

    Returns
    -------
    dict
        ``{week: [record, ...]}`` where each record is the dict returned by
        ``analyzeWebpage`` for a kept article.

    Notes
    -----
    Pauses 2-4 seconds after each kept article and prints a progress line
    per week.
    """
    textList = {}
    for week in urlList[year]:
        # results are grouped by week
        textList[week] = []
        # Slicing instead of indexing [0] and [1]: a week with fewer links
        # than the limit is processed as far as it goes instead of raising
        # IndexError.
        for content in urlList[year][week][:maxPerWeek]:
            # close the HTTP response as soon as the page has been parsed
            with urlopen(basic_url + content) as response:
                bs1bj = BeautifulSoup(response, 'html5lib')

            # Many pages carry no article-type tag; those get the
            # placeholder 'none', which is not in typeList, so they are skipped.
            typeTag = bs1bj.find('meta', {'name': 'citation_article_type'})
            articleType = typeTag.get('content') if typeTag is not None else 'none'

            if articleType in typeList:
                textList[week].append(analyzeWebpage(bs1bj))
                time.sleep(2 * (1 + random.random()))
        print('finished:' + week)
    return textList

In [43]:
def _metaContent(soup, name):
    """Return the ``content`` attribute of the first ``<meta>`` tag whose ``name`` is ``name``."""
    return soup.find('meta', {'name': name}).get('content')


def _metaContents(soup, name):
    """Return the ``content`` attribute of every ``<meta>`` tag whose ``name`` is ``name``."""
    return [tag.get('content') for tag in soup.findAll('meta', {'name': name})]


def _joinedText(tags):
    """Concatenate the text of ``tags``; newlines become spaces and every piece is followed by a space."""
    return ''.join(tag.get_text().replace('\n', ' ') + ' ' for tag in tags)


def analyzeWebpage(bs1bj):
    """Extract citation metadata and text sections from one parsed article page.

    Parameters
    ----------
    bs1bj
        Parsed page offering ``find`` / ``findAll`` (a ``BeautifulSoup``
        object in this notebook).

    Returns
    -------
    dict
        Keys ``type``, ``title``, ``author`` (list), ``authorInstitution``
        (list), ``publisher``, ``date``, ``doi``, ``pdfLink``, ``textLink``,
        ``text``, ``abstract``, ``introduction`` and ``discussion``. The four
        text fields are strings and are ``''`` when the page has no matching
        element.

    Raises
    ------
    AttributeError
        If one of the single-valued tags (title, article type, publisher,
        publication date, DOI, PDF link, plain-text link) is missing.

    Notes
    -----
    Sleeps 10-20 seconds before returning.
    """
    textDict = {
        'type': _metaContent(bs1bj, 'citation_article_type'),
        'title': _metaContent(bs1bj, 'citation_title'),
        'author': _metaContents(bs1bj, 'citation_author'),
        'authorInstitution': _metaContents(bs1bj, 'citation_author_institution'),
        'publisher': _metaContent(bs1bj, 'citation_publisher'),
        'date': _metaContent(bs1bj, 'citation_publication_date'),
        'doi': _metaContent(bs1bj, 'citation_doi'),
        # the two link lookups are still marked "need to be verified"
        'pdfLink': bs1bj.find('link', {'title': 'Full Text (PDF)'}).get('href'),
        'textLink': bs1bj.find('link', {'title': 'Full Text (Plain)'}).get('href'),
    }

    # Body text: every <p> whose id matches the pattern. The pattern is now a
    # raw string, so the backslash no longer forms an invalid string escape;
    # the compiled regex is unchanged.
    textDict['text'] = _joinedText(bs1bj.findAll('p', {'id': re.compile(r'p\-[0-9]*')}))

    # Sections used for searching by abstract / introduction / discussion.
    # A page without a given section yields no matches and therefore '',
    # so no exception handling is needed for that case.
    textDict['abstract'] = _joinedText(bs1bj.findAll('div', {'class': 'section abstract'}))
    textDict['introduction'] = _joinedText(bs1bj.findAll('div', {'class': 'section introduction'}))
    textDict['discussion'] = _joinedText(bs1bj.findAll('div', {'class': 'section discussion'}))

    # randomized pause after each analyzed page
    time.sleep(10*(1+random.random()))

    return textDict


In [44]:
def saveasJSON(textList,year,path="../data/json/test.json"):
    """Write the scraped records to one JSON file and return the saved structure.

    The records of all weeks are numbered consecutively from 0 and stored as
    ``{"maximumIndex": <last index>, "data": {<index>: <record>}}``. Any
    existing file at ``path`` is overwritten. JSON object keys are always
    strings, so the indices come back as ``'0'``, ``'1'``, ... when the file
    is loaded again.

    Parameters
    ----------
    textList : dict
        ``{week: [record, ...]}`` as returned by ``scraping``.
    year
        Not used; kept so existing calls keep working.
    path : str
        Destination file. Its directory must already exist.

    Returns
    -------
    dict
        The structure that was written, with integer indices as keys of
        ``data``. ``maximumIndex`` is -1 when there are no records.
    """
    model = {'maximumIndex': -1, 'data': {}}  # data

    index = model['maximumIndex']
    for week in textList:
        for record in textList[week]:
            index += 1
            model['data'][index] = record
    model['maximumIndex'] = index

    # A single dump of the whole structure keeps the file valid JSON;
    # dumping each record separately would write objects back to back.
    with open(path,'w',encoding='utf-8') as json_file:
        json.dump(model,json_file,ensure_ascii=False)

    return model
            

In [45]:
# Fetch and parse the 2016 articles from the links collected in `urls`
textList = scraping(basic_url, urls, year=2016)

finished:Jan 01
finished:Jan 08
finished:Jan 15
finished:Jan 22
finished:Jan 29
finished:Feb 05
finished:Feb 12
finished:Feb 19
finished:Feb 26
finished:Mar 04
finished:Mar 11
finished:Mar 18
finished:Mar 25
finished:Apr 01
finished:Apr 08
finished:Apr 15
finished:Apr 22
finished:Apr 29
finished:May 06
finished:May 13
finished:May 20
finished:May 27
finished:Jun 03
finished:Jun 10
finished:Jun 17
finished:Jun 24
finished:Jul 01
finished:Jul 08
finished:Jul 15
finished:Jul 22
finished:Jul 29
finished:Aug 05
finished:Aug 12
finished:Aug 19
finished:Aug 26
finished:Sep 02
finished:Sep 09
finished:Sep 16
finished:Sep 23
finished:Sep 30
finished:Oct 07
finished:Oct 14
finished:Oct 21
finished:Oct 28
finished:Nov 04
finished:Nov 11
finished:Nov 18
finished:Nov 25
finished:Dec 02
finished:Dec 09
finished:Dec 16
finished:Dec 23


In [46]:
# Reset the output file to an empty database skeleton
model = dict(maximumIndex=0, data={})  # data
with open("../data/json/test.json", mode='w', encoding='utf-8') as json_file:
    json.dump(model, json_file, ensure_ascii=False)

In [158]:
#textList['Jan 06'][0]

In [47]:
# Flatten the per-week lists into one sequence and number the records from 0.
# With no records at all, maximumIndex stays at -1.
records = [d for week in textList for d in textList[week]]

Index = len(records) - 1
p = {'maximumIndex': Index, 'data': dict(enumerate(records))}  # data

# Persist the whole structure in one dump
with open("../data/json/test.json", mode='w', encoding='utf-8') as json_file:
    json.dump(p, json_file)


In [2]:
# Read the saved database back from disk
with open("../data/json/test.json", encoding='utf-8') as json_file:
    p = json.loads(json_file.read())

In [48]:
# Show the record indices held in p['data'].
# NOTE(review): these are ints for the dict built in memory, but strings
# ('0', '1', ...) once p has been reloaded with json.load.
p['data'].keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43])

In [None]:
# Save each article's PDF as ../data/pdf/<index>.pdf, pausing 30-60 s between downloads
for key, record in p['data'].items():
    pdfUrl = basic_url + record['pdfLink']
    urlretrieve(pdfUrl, '../data/pdf/' + str(key) + '.pdf')
    time.sleep(30 * (1 + random.random()))

In [51]:
# Print the author list of every stored article, one article per line
for key in p['data']:
    record = p['data'][key]
    print(record['author'])


['Marcia McNutt']
['David King']
['Bruce Alberts']
['Johan Rockström']
['Geraldine Richmond']
['Rush Holt']
['Marcia McNutt']
['Marcia McNutt']
['Byung Gwon Lee']
['Marcia McNutt']
['Michael S. Turner']
['S. J. Gates']
['Paul G. Allen']
['Anne Glover']
['Marcia McNutt']
['Julia K. Goodrich', 'Emily R. Davenport', 'Jillian L. Waters', 'Andrew G. Clark', 'Ruth E. Ley']
['Alan I. Leshner']
['Marcelo Sánchez Sorondo', 'Veerabhadran Ramanathan']
['Marcia McNutt']
['Marcia McNutt']
['Guang-Zhong Yang', 'Marcia McNutt']
['Wendy V. Gilbert', 'Tristan A. Bell', 'Cassandra Schaening']
['Marcia McNutt']
['Graeme Reid']
['Jeremy Berg']
['James Wilsdon']
['France A. Córdova']
['Jeremy Berg']
['Helga Nowotny', 'Jana Kolar']
['Jeremy Berg']
['Peter Gluckman']
['Michael T. Osterholm']
['May R. Berenbaum']
['David Baltimore']
['Shaohua Fan', 'Matthew E. B. Hansen', 'Yancy Lo', 'Sarah A. Tishkoff']
['Eric A. Miska', 'Anne C. Ferguson-Smith']
['Subra Suresh', 'Robert A. Bradway']
['Jeremy Berg']
['Patric

In [52]:
# Number of article records stored in p['data']
len(p['data'])

44