In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import unidecode
import json
import re
from typing import Any, Dict, List, Callable
from datetime import datetime

# Load JSON Data File

## Why are we doing this ?

We extract the HAL API results and store them in a json file.

If someone edits an article the `submittedDate_tdate` in HAL API results will change

### Example

Imagine the following use case :

| DATE       | ACTION                                                                  | RESULT                                                                          |
|:----------:|:------------------------------------------------------------------------|:--------------------------------------------------------------------------------|
| 2021-02-14 | An authors submitted the document with a preprint versions              | The document `sample_uri` is created with `submittedDate_tdate=2021-02-14`      |
| 2021-07-10 | We extract all articles submitted between `2017-01-01` and `2021-07-05` | The document `sample_uri` is in the exported data                               |
| 2021-07-17 | The publisher change the document to submit the printed versions        | The document `sample_uri` is updated. The `submittedDate_tdate` will be updated to `submittedDate_tdate=2021-07-17` |
| 2021-07-21 | We extract all articles submitted between `2017-01-01` and `2021-07-05` | The document `sample_uri` is **NOT ANYMORE** in the exported data               |

### Is there a solution ?

If fact, there is no magic here...

We have to retrieve **manually** all articles between `2021-07-05` and `NOW` and filtered the document that should be integrated in the exported data !

## Constants

Define primary constants

* `LAB_STRUCT_ID` : The identifier in HAL of the CES
* `EXPORT_DATE` : The current date as string

In [2]:
LAB_STRUCT_ID = 15080
EXPORT_DATE = datetime.now().strftime("%Y-%m-%dT%H-%M")

## Datasets

To retrieve the data we will work on, we're using the HAL API. This API is very well documented here :

* Getting started : https://api.archives-ouvertes.fr/docs/search
* Fields : https://api.archives-ouvertes.fr/docs/search/?schema=fields#fields

To retrieve the data we will work on, we're using the HAL API. This API is very well documented here :

* Getting started : https://api.archives-ouvertes.fr/docs/search
* Fields : https://api.archives-ouvertes.fr/docs/search/?schema=fields#fields


### Calculate number of documents for the specified period

In [3]:
filter_queries = dict(
    publicationDate_tdate='[2017-01-01T00:00:00Z TO 2022-12-31T23:59:59Z]',
#    submittedDate_tdate='[2017-01-01T00:00:00Z TO 2022-12-01T23:59:59Z]',
#    docType_s='ART'
)
response_fields = [
    'label_s',
    'uri_s',
    'keyword_s',
    'audience_s',
    'authEmail_s',
    'authFullName_s',
    'authFirstName_s',
    'authLastName_s',
    'authStructId_i',
    'authIdHal_i',
    'authIdFullName_fs',
    'authIdHasPrimaryStructure_fs',
    'authIdHasStructure_fs',
    'docType_s',
    'keyword_s',
    'jel_s',
    'journalIssn_s',
    'journalTitle_s',
    'anrProjectReference_s',
    'anrProjectTitle_s',
    'anrProjectAcronym_s',
    'europeanProjectReference_s',
    'europeanProjectReference_s',
    'europeanProjectAcronym_s',
    'funding_s',
    'instStructCountry_s',
    'instStructName_s',
    'files_s',
    'language_s',
    'labStructAcronym_s',
    'labStructCode_s',
    'contributorFullName_s',
    'openAccess_bool',
    'peerReviewing_s',
    'peerReviewing_t',
]
params = dict(
    wt='json',
    q=f'labStructId_i:{LAB_STRUCT_ID}',
    fq=[f'{k}:{v}' for k, v in filter_queries.items()],
    fl=','.join(response_fields)
)
# Count the number of rows
r = requests.get('http://api.archives-ouvertes.fr/search', params={**params | dict(rows=0)})
num_found = r.json().get('response', dict()).get('numFound', 0)
print(f'The is {num_found} documents response for query :\n{json.dumps(params, indent=2)}')

The is 1559 documents response for query :
{
  "wt": "json",
  "q": "labStructId_i:15080",
  "fq": [
    "publicationDate_tdate:[2017-01-01T00:00:00Z TO 2022-12-31T23:59:59Z]"
  ],
  "fl": "label_s,uri_s,keyword_s,audience_s,authEmail_s,authFullName_s,authFirstName_s,authLastName_s,authStructId_i,authIdHal_i,authIdFullName_fs,authIdHasPrimaryStructure_fs,authIdHasStructure_fs,docType_s,keyword_s,jel_s,journalIssn_s,journalTitle_s,anrProjectReference_s,anrProjectTitle_s,anrProjectAcronym_s,europeanProjectReference_s,europeanProjectReference_s,europeanProjectAcronym_s,funding_s,instStructCountry_s,instStructName_s,files_s,language_s,labStructAcronym_s,labStructCode_s,contributorFullName_s,openAccess_bool,peerReviewing_s,peerReviewing_t"
}


### Writing all the documents in a result files 

In [4]:
r = requests.get('http://api.archives-ouvertes.fr/search', params={**params | dict(rows=num_found)})
print(f'Read the documents : {r.url}')

with open(f'./result.{EXPORT_DATE}.json', 'w') as f:
    f.write(r.text)

Read the documents : http://api.archives-ouvertes.fr/search?wt=json&q=labStructId_i%3A15080&fq=publicationDate_tdate%3A%5B2017-01-01T00%3A00%3A00Z+TO+2022-12-31T23%3A59%3A59Z%5D&fl=label_s%2Curi_s%2Ckeyword_s%2Caudience_s%2CauthEmail_s%2CauthFullName_s%2CauthFirstName_s%2CauthLastName_s%2CauthStructId_i%2CauthIdHal_i%2CauthIdFullName_fs%2CauthIdHasPrimaryStructure_fs%2CauthIdHasStructure_fs%2CdocType_s%2Ckeyword_s%2Cjel_s%2CjournalIssn_s%2CjournalTitle_s%2CanrProjectReference_s%2CanrProjectTitle_s%2CanrProjectAcronym_s%2CeuropeanProjectReference_s%2CeuropeanProjectReference_s%2CeuropeanProjectAcronym_s%2Cfunding_s%2CinstStructCountry_s%2CinstStructName_s%2Cfiles_s%2Clanguage_s%2ClabStructAcronym_s%2ClabStructCode_s%2CcontributorFullName_s%2CopenAccess_bool%2CpeerReviewing_s%2CpeerReviewing_t&rows=1559


### Dataset preview 

In [5]:
ddf = pd.DataFrame.from_dict(r.json().get('response').get('docs'))
ddf.head()

Unnamed: 0,label_s,openAccess_bool,audience_s,authLastName_s,authFirstName_s,authFullName_s,authIdFullName_fs,authIdHasPrimaryStructure_fs,authIdHasStructure_fs,authIdHal_i,...,journalTitle_s,journalIssn_s,authStructId_i,funding_s,anrProjectTitle_s,anrProjectAcronym_s,anrProjectReference_s,files_s,europeanProjectAcronym_s,europeanProjectReference_s
0,"Hamza El Khalloufi, Pierre-Charles Pradier. Th...",True,2.0,"[El Khalloufi, Pradier]","[Hamza, Pierre-Charles]","[Hamza El Khalloufi, Pierre-Charles Pradier]","[0_FacetSep_Hamza El Khalloufi, 11901_FacetSep...",[1076003-0_FacetSep_Hamza El Khalloufi_JoinSep...,[1076003-0_FacetSep_Hamza El Khalloufi_JoinSep...,[11901],...,,,,,,,,,,
1,Coralie Perez. Avec le Compte Personnel de For...,False,2.0,[Perez],[Coralie],[Coralie Perez],[4756_FacetSep_Coralie Perez],[7585-4756_FacetSep_Coralie Perez_JoinSep_1508...,[7585-4756_FacetSep_Coralie Perez_JoinSep_1508...,[4756],...,Savoirs : Revue internationale de recherches e...,1763-4229,[15080],,,,,,,
2,"Cuong Le Van, Nguyen To The. Farmers’ adoption...",True,2.0,"[Le Van, To The]","[Cuong, Nguyen]","[Cuong Le Van, Nguyen To The]","[0_FacetSep_Cuong Le Van, 0_FacetSep_Nguyen To...",[165984-0_FacetSep_Cuong Le Van_JoinSep_542840...,[165984-0_FacetSep_Cuong Le Van_JoinSep_542840...,,...,Asia-Pacific Journal of Regional Science,2509-7946,,[Cuong Le Van is partially funded by Vietnam N...,,,,,,
3,"Cuong Le Van, Ngoc-Sang Pham. Demand and equil...",True,2.0,"[Le Van, Pham]","[Cuong, Ngoc-Sang]","[Cuong Le Van, Ngoc-Sang Pham]","[0_FacetSep_Cuong Le Van, 0_FacetSep_Ngoc-Sang...",[165984-0_FacetSep_Cuong Le Van_JoinSep_542840...,[165984-0_FacetSep_Cuong Le Van_JoinSep_542840...,,...,Mathematical Social Sciences,0165-4896,,"[For this research, Cuong Le Van is funded by ...",[Entrepreneurship],[Entreprendre],[ANR-10-LABX-0011],,,
4,"Guillaume Gaulier, Aude Sztulman, Deniz Ünal. ...",True,,"[Gaulier, Sztulman, Ünal]","[Guillaume, Aude, Deniz]","[Guillaume Gaulier, Aude Sztulman, Deniz Ünal]","[759678_FacetSep_Guillaume Gaulier, 0_FacetSep...",[308488-759678_FacetSep_Guillaume Gaulier_Join...,[308488-759678_FacetSep_Guillaume Gaulier_Join...,[759678],...,,,,,,,,[https://hal.archives-ouvertes.fr/hal-02315466...,,
