<a href="https://colab.research.google.com/github/carloscastillo10/kbs-scientific-publications/blob/development/notebooks/extract-data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries

In [14]:
import getpass
import json
import os
from datetime import datetime

import pytz
import requests

## Methods

#### ISO date to string date

In [16]:
def convert_isodate_to_strdate(iso_date):
  """Convert an ISO-8601 'Z' timestamp into a 'YYYY-MM-DD HH:MM:SS' string.

  Timestamps with or without fractional seconds are accepted, e.g.
  '2020-05-12T13:45:10Z' or '2020-05-12T13:45:10.123Z'.

  Args:
    iso_date: Timestamp string terminated by a literal 'Z'.

  Returns:
    The timestamp formatted with '%Y-%m-%d %H:%M:%S' (any fraction is dropped).

  Raises:
    ValueError: If iso_date matches neither supported layout.
  """
  # A bare '%S%f' (no separator) lets strptime split a two-digit seconds
  # field between %S and %f ('10' -> 1 second + 0 microseconds), silently
  # corrupting the value. The fraction is therefore only parsed after an
  # explicit '.'.
  iso_formats = ('%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S.%fZ')
  for iso_format in iso_formats:
    try:
      date = datetime.strptime(iso_date, iso_format)
    except ValueError:
      continue
    return date.strftime('%Y-%m-%d %H:%M:%S')
  raise ValueError(f'Unsupported ISO date format: {iso_date!r}')

#### Get current date

In [17]:
def get_current_date():
  """Return today's date in the America/Lima timezone as 'YYYY-MM-DD'."""
  lima_timezone = pytz.timezone("America/Lima")
  return datetime.now(lima_timezone).strftime('%Y-%m-%d')

#### Build the article record from a Scopus API work

In [22]:
def set_scopus_body(work):
  """Build the base article record from one Scopus search-result entry.

  Args:
    work: Dict for a single entry of the Scopus search response. The keys
      'prism:doi', 'dc:title', 'prism:url', 'prism:publicationName',
      'dc:identifier', 'openaccess' and 'citedby-count' are required;
      'prism:issn', 'article-number', 'prism:volume' and 'affiliation'
      are optional.

  Returns:
    Dict with the article metadata, an 'organization' list and a
    'citations' list seeded with the Scopus citation count. The 'volume'
    key is only present when the entry provides 'prism:volume'.

  Raises:
    KeyError: If a required key is missing from work (or from one of its
      affiliation entries).
  """
  scopus_body = {
    'doi': work['prism:doi'], 'title': work['dc:title'], 'url': work['prism:url'],
    'publication_name': work['prism:publicationName'], 'identifier': work['dc:identifier'],
    'issn': '', 'article_number': '',
    'open_access': work['openaccess'],
    'organization': [], 'citations': [
      set_article_citation('Scopus', 'Número de citas en Scopus', work['citedby-count'])
    ],
  }

  if 'prism:issn' in work:
    scopus_body['issn'] = work['prism:issn']
  if 'article-number' in work:
    scopus_body['article_number'] = work['article-number']
  if 'prism:volume' in work:
    # Store the plain value; a trailing comma here would wrap it in a
    # 1-tuple and serialise it as a JSON list.
    scopus_body['volume'] = work['prism:volume']
  if 'affiliation' in work:
    scopus_body['organization'] = [
      {
        'name': organization['affilname'],
        'city': {'name': organization['affiliation-city']},
        'country': {'name': organization['affiliation-country']}
      } for organization in work['affiliation']
    ]

  return scopus_body
  

#### Get article information from Crossref using the DOI

In [9]:
def get_crossref_content(crossref_uri, doi):
  """Fetch Crossref metadata for a DOI and map it onto the article schema.

  Args:
    crossref_uri: Base URL of the Crossref works endpoint; '/<doi>' is
      appended to it.
    doi: DOI of the article to look up.

  Returns:
    Dict with the keys 'created', 'deposited', 'abstract', 'score',
    'publisher', 'language', 'subject' and 'author'. When the response
    status is not 200 every field keeps its empty default.

  Raises:
    KeyError: If a 200 response lacks 'created', 'deposited', 'score' or
      'publisher', or an author entry lacks 'sequence'.
  """
  crossref_body = {'created': '', 'deposited': '', 'abstract': '', 'score': '', 'publisher': '', 'language': '', 'subject': '', 'author': []}
  crossref_response = requests.get(f'{crossref_uri}/{doi}')
  if crossref_response.status_code != 200:
    return crossref_body

  crossref_content = crossref_response.json()['message']
  crossref_body.update({
    'created': convert_isodate_to_strdate(crossref_content['created']['date-time']),
    'deposited': convert_isodate_to_strdate(crossref_content['deposited']['date-time']),
    'score': crossref_content['score'],
    'publisher': crossref_content['publisher'],
  })

  if 'abstract' in crossref_content:
    crossref_body['abstract'] = crossref_content['abstract']
  if 'subject' in crossref_content:
    crossref_body['subject'] = [{'name': subject_name} for subject_name in crossref_content['subject']]
  if 'language' in crossref_content:
    crossref_body['language'] = {'acronym': crossref_content['language']}

  for author in crossref_content.get('author', []):
    # Take the first name variant that is present. The key that is tested is
    # the key that is read (previously 'give_name' was tested but
    # 'given_name' was read).
    given_name = next(
      (author[key] for key in ('given', 'given_name', 'name') if key in author), ''
    )
    family_name = next(
      (author[key] for key in ('family', 'family_name') if key in author), ''
    )

    # The output key 'give_name' is kept as-is so the produced JSON schema
    # does not change.
    crossref_body['author'].append({
      'give_name': given_name, 'family_name': family_name, 'sequence': author['sequence'],
      'organization': [
        {'name': organization['name'], 'city': {'name': ''}, 'country': {'name': ''}}
        for organization in author.get('affiliation', [])
      ]
    })
  return crossref_body

#### Get article information from Altmetric using the DOI

In [10]:
def get_altmetric_content(altmetric_uri, doi, article_body):
  """Append Altmetric citation counters for a DOI to an article record.

  Args:
    altmetric_uri: Base URL of the Altmetric API; '/doi/<doi>' is appended.
    doi: DOI of the article to look up.
    article_body: Article dict holding a 'citations' list; it is modified
      in place.

  Returns:
    The same article_body object. It is left untouched when the response
    status is not 200.
  """
  response = requests.get(f'{altmetric_uri}/doi/{doi}')
  if response.status_code != 200:
    return article_body

  metrics = response.json()
  citations = article_body['citations']

  # (citation name, description, path of keys inside the Altmetric payload).
  # All of these counters are mandatory: a missing key raises before
  # anything is appended to the article.
  required_metrics = (
    ('Posts', 'Número de publicaciones que mencionan al artículo', ('cited_by_posts_count',)),
    ('Accounts', 'Suma de todas las entradas ”cited_by” que han utilizado el artículo', ('cited_by_accounts_count',)),
    ('Ulike', 'Número de lectores en citeUlike', ('readers', 'citeulike')),
    ('Mendeley', 'Número de lectores en Mendeley', ('readers', 'mendeley')),
    ('Connotea', 'Número de lectores en Connotea', ('readers', 'connotea')),
  )
  collected = []
  for name, description, key_path in required_metrics:
    value = metrics
    for key in key_path:
      value = value[key]
    collected.append(set_article_citation(name, description, value))
  citations.extend(collected)

  # Optional fields reported only for some articles.
  if 'cited_by_tweeters_count' in metrics:
    citations.append(
      set_article_citation('Twitter', 'Número de cuentas de twitter que han tuiteado el artículo', metrics['cited_by_tweeters_count'])
    )
  if 'journal' in metrics:
    article_body['journal'] = metrics['journal']
  return article_body

In [18]:
def set_article_citation(name, description, number):
  """Build one citation entry stamped with the current date.

  Args:
    name: Short name of the citation source.
    description: Human-readable description of the counter.
    number: Counter value; it is converted with int().

  Returns:
    Dict with the keys 'name', 'description', 'date' and 'number'.
  """
  citation = dict(name=name, description=description)
  citation['date'] = get_current_date()
  citation['number'] = int(number)
  return citation

#### Save dictionary to a JSON file

In [12]:
def save_articles(articles):
  """Serialise articles as 4-space-indented JSON into 'articles.json'.

  The file is created in the current working directory and overwritten if
  it already exists.
  """
  output_path = 'articles.json'
  with open(output_path, 'w') as output_file:
    json.dump(articles, output_file, indent=4)

## Main

In [19]:
# Base endpoints of the three services queried by this notebook.
scopus_uri = 'https://api.elsevier.com/content/search/scopus'  # queried with the search payload as URL parameters
crossref_uri = 'https://api.crossref.org/works'  # get_crossref_content appends '/<doi>'
altmetric_uri = 'https://api.altmetric.com/v1'  # get_altmetric_content appends '/doi/<doi>'

In [24]:
# Credentials must not live in the notebook: the Scopus API key is taken from
# the SCOPUS_API_KEY environment variable, or prompted for when it is unset.
# A key that has been committed inside a notebook should be treated as leaked
# and rotated.
scopus_api_key = os.environ.get('SCOPUS_API_KEY') or getpass.getpass('Scopus API key: ')

# First page of the Scopus search; 'start' is advanced by the harvesting loop.
offset = 0
payload = {
  'query': 'TITLE-ABS-KEY(covid AND Latin AND America)',
  'count': 25,
  'start': offset,
  'sort': 'relevancy',
  'apikey': scopus_api_key,
}
scopus_response = requests.get(scopus_uri, params=payload)

In [25]:
# Page through the Scopus results and enrich every work that has a DOI with
# Crossref and Altmetric data. Paging stops at the first non-200 response or
# at the first page without an 'entry' list.
articles = {'content': []}
page_size = payload['count']  # keep the paging step in sync with the requested page size
while scopus_response.status_code == 200:
  search_results = scopus_response.json()['search-results']  # parse each page only once
  if 'entry' not in search_results:
    break
  print(offset)
  for work in search_results['entry']:
    doi = None
    try:
      if 'prism:doi' not in work:
        continue
      doi = work['prism:doi']
      article_body = set_scopus_body(work)

      # Get article information in crossref using doi
      article_body.update(get_crossref_content(crossref_uri, doi))

      # Get article information in altmetric using doi
      article_body.update(get_altmetric_content(altmetric_uri, doi, article_body))
      articles['content'].append(article_body)
    except Exception as e:
      # Best effort: a failure on one article must not abort the whole
      # harvest, but report which DOI was dropped and the exception type.
      print(f'Skipping article {doi}: {e!r}')
  offset += page_size
  payload['start'] = offset
  scopus_response = requests.get(scopus_uri, params=payload)

# Make a premature stop (quota, bad key, paging limit) visible instead of
# silently returning a truncated result set.
if scopus_response.status_code != 200:
  print(f'Stopped paging at offset {offset}: Scopus returned HTTP {scopus_response.status_code}')

0
25
50
75
100
125
150
175
200
225
250
275
300
325
350
375
400
425
450
475
500
525
550
575
600
625
650
675
700
725
750
775
800
825
850
875
900
925
950
975
1000
1025
1050


In [29]:
# Number of article records accumulated in articles['content'].
len(articles['content'])

1022

In [28]:
# Write the collected articles to 'articles.json'.
save_articles(articles)