In [13]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import time
import json
import random

## Database Information 

Database Name: Paper_metadata.json

Data Source: sciencemag.org

Metadata: Title, Publication Year, Author, Category, Institution, DOI, Subject, Content:( Abstract, Citation, Full Content)

### JSON Structure

{"index":metadata, Content:{Abstract, Citation}}

## Webpage Structure

The archives in sicencemag/conten/by/year

We can find search paper by year and week

<img src='../pic/archives_by_year.png' style="width: 600px;"/>

Click one we can get paper links

<img src='../pic/contents_by_week.png' style="width: 600px;"/>

Then Analyze the links we can get some metadata and documents

In [5]:
basic_url='http://science.sciencemag.org'

In [32]:
# too many urls here, scrap the urls first
def scrapingURL(range1=2010,range2=2017):
    #########################################
    #
    # Func scrapingUPL will return urlList
    # urlList:{year:{week:{}}}
    #
    #########################################
    
    global basic_url
    urlList = {}
    #by years
    for year in range(range1,range2+1):
        #url address
        html=basic_url+'/content/by/year/'+str(year)
        
        #directory[year]
        urlList[year]={}
        
        #open the html
        b0 = urlopen(html)
        bs0bj = BeautifulSoup(b0,'html5lib')
        
        #url list
        list = bs0bj.find('div',{'class':'pane-content'})\
        .findAll('a',{'class':'highwire-cite-linked-title'})
        
        #
        for i in range(len(list)):
            #weekly lests
            urlList[year][list[i].get_text()[0:6]]=[]
            
            link=list[i].get('href')
            b1 = urlopen(basic_url + link)
            bs1bj = BeautifulSoup(b1, 'html5lib')
            
            #paper links
            fullList = bs1bj.findAll('a', {'title': 'Full Text'})
            
            #store links
            for content in fullList:
                textUrl = content.get('href')
                urlList[year][list[i].get_text()[0:6]].append(textUrl)
            
            #sleep
            time.sleep(2 * (1+random.random()) )
            print("finished: "+str(year)+' '+list[i].get_text()[0:6])
            return urlList

In [33]:
urls = scrapingURL(range1=2017,range2=2017)

finished: 2017 Jan 06


In [48]:
def scraping(basic_url,urlList,year,typeList=['Editorial','Research Article','Review Article','Letter','Book Review']):
    #text data
    textList = {}
    for week in urlList[year]:
        #classified by weeks
        textList[week] = []
        for content in urlList[year][week]:
            #read url
            b1 = urlopen(basic_url + content)
            bs1bj = BeautifulSoup(b1, 'html5lib')
            # check if there is article categories
            # Since there are many redundant information
            try:
                articleType = bs1bj.find('meta', {'name': 'citation_article_type'}).get('content')
            except:
                articleType = 'none'
            # while the type of paper is in the list( We only scrap several specific types of articles)
            # 'Editorial','Research Article','Review Article','Letter','Book Review'
            if articleType in typeList:
                #title
                title = bs1bj.find('meta', {'name': 'citation_title'}).get('content')
                
                #Author
                authorList = bs1bj.findAll('meta', {'name': 'citation_author'})
                author = []
                for m in authorList:
                    author.append(m.get('content'))
                
                #Institution
                institutionList = bs1bj.findAll('meta', {'name': 'citation_author_institution'})
                institution = []
                for InstiData in institutionList:
                    institution.append(InstiData.get('content'))
                    
                #Publisher
                publisher = bs1bj.find('meta', {'name': 'citation_publisher'}).get('content')
                
                #Publication data
                pubDate = bs1bj.find('meta', {'name': 'citation_publication_date'}).get('content')
                
                #DOI
                doi = bs1bj.find('meta', {'name': 'citation_doi'}).get('content')
                
                ##PDF (need to be verified)
                pdfLink = bs1bj.find('link', {'title': 'Full Text (PDF)'}).get('href')
                
                ##text (need to be verified)
                textLink = bs1bj.find('link', {'title': 'Full Text (Plain)'}).get('href')
                
                # put them in to directory
                textDict = {}
                textDict['type'] = articleType
                textDict['title'] = title
                textDict['author'] = author
                textDict['authorInstitution'] = institution
                textDict['publisher'] = publisher
                textDict['date'] = pubDate
                textDict['doi'] = doi
                textDict['pdf'] = pdfLink
                textDict['textList'] = textList
                # content
                textDict['text'] = ''
                textDict['abstract'] = ''
                textDict['introduction'] = ''
                textDict['discussion'] = ''

                # Data for function to search by abstrace, introduction, discussion part
                textParts = bs1bj.findAll('p', {'id': re.compile('p\-[0-9]*')})
                for finding in textParts:
                    textDict['text'] += finding.get_text().replace('\n',' ') + ' '
                # Abstract
                # use try/except since articles may not have abstract
                try:
                    abstractParts = bs1bj.findAll('div', {'class': 'section abstract'})
                    for finding in abstractParts:
                        textDict['abstract'] += finding.get_text().replace('\n',' ') + ' '
                except:
                    textDict['abstract'] = None
                    
                # Introduction
                try:
                    introParts = bs1bj.findAll('div', {'class': 'section introduction'})
                    for finding in introParts:
                        textDict['introduction'] +=finding.get_text().replace('\n',' ') + ' '
                except:
                    textDict['introduction'] = None
                    
                # Discussion
                try:
                    discussionParts = bs1bj.findAll('div', {'class': 'section discussion'})
                    for finding in discussionParts:
                        textDict['discussion'] +=  finding.get_text().replace('\n',' ') + ' '
                except:
                    textDict['discussion'] = None

                textList[week].append(textDict)
                
                time.sleep(2 * (1+random.random()) )

        print('finished:' + week)
    return textList

In [49]:
def saveasJSON(textList,year):
    model= textList #数据
    with open("../data/json/test.json",'w',encoding='utf-8') as json_file:
        maximumIndex = json_file['maximumIndex']
        
    with open("../data/json/test/data.json",'w',encoding='utf-8') as json_file:
        maximumIndex = json_file['maximumIndex']
        for week in model:
            jsonDataList = model[week]
            for d in jsonDataList:
                d
                json.dump(d,json_file,ensure_ascii=False)
            

In [50]:
textList = scraping(basic_url,urls,2017)

finished:Jan 06


In [52]:
model= {'maximumIndex':0, 'data':{}} #数据
with open("../data/json/test.json",'w',encoding='utf-8') as json_file:
    json.dump(model,json_file,ensure_ascii=False)

In [53]:
textList

{'Jan 06': [{'abstract': '',
   'author': ['Jeremy Berg'],
   'authorInstitution': ['Editor-in-Chief, Science Journals.'],
   'date': '2017/01/06',
   'discussion': '',
   'doi': '10.1126/science.aam6743',
   'introduction': '',
   'pdf': '/content/355/6320/9.full.pdf',
   'publisher': 'American Association for the Advancement of Science',
   'text': "A year ago, Science's Editor-in-Chief Marcia McNutt highlighted two new journals in the Science family. Indeed, with the 2016 launches of Science Immunology and Science Robotics, the Science family now has six members including, in addition, Science, Science Signaling, Science Translational Medicine, and Science Advances. This growth has occurred through a number of distinct opportunities, involving the emergence of new multidisciplinary research areas as well as alternative ways to accelerate the communication of research. The family now embodies a range of publishing and editorial models, with the content of Science Robotics and the ope

In [56]:
with open("../data/json/test.json",'w',encoding='utf-8') as json_file:
    print(json_file)


<_io.TextIOWrapper name='../data/json/test.json' mode='w' encoding='utf-8'>
