In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import unidecode
import json
import re
from typing import Any, Dict, List, Callable
from datetime import datetime

# Load JSON Data File

## Why are we doing this ?

We extract the HAL API results and store them in a json file.

If someone edits an article the `submittedDate_tdate` in HAL API results will change

### Example

Imagine the following use case :

| DATE       | ACTION                                                                  | RESULT                                                                          |
|:----------:|:------------------------------------------------------------------------|:--------------------------------------------------------------------------------|
| 2021-02-14 | An authors submitted the document with a preprint versions              | The document `sample_uri` is created with `submittedDate_tdate=2021-02-14`      |
| 2021-07-10 | We extract all articles submitted between `2017-01-01` and `2021-07-05` | The document `sample_uri` is in the exported data                               |
| 2021-07-17 | The publisher change the document to submit the printed versions        | The document `sample_uri` is updated. The `submittedDate_tdate` will be updated to `submittedDate_tdate=2021-07-17` |
| 2021-07-21 | We extract all articles submitted between `2017-01-01` and `2021-07-05` | The document `sample_uri` is **NOT ANYMORE** in the exported data               |

### Is there a solution ?

If fact, there is no magic here...

We have to retrieve **manually** all articles between `2021-07-05` and `NOW` and filtered the document that should be integrated in the exported data !

## Constants

Define primary constants

* `LAB_STRUCT_ID` : The identifier in HAL of the CES
* `EXPORT_DATE` : The current date as string

In [2]:
LAB_STRUCT_ID = 15080
EXPORT_DATE = datetime.now().strftime("%Y-%m-%dT%H-%M")

## Datasets

To retrieve the data we will work on, we're using the HAL API. This API is very well documented here :

* Getting started : https://api.archives-ouvertes.fr/docs/search
* Fields : https://api.archives-ouvertes.fr/docs/search/?schema=fields#fields

To retrieve the data we will work on, we're using the HAL API. This API is very well documented here :

* Getting started : https://api.archives-ouvertes.fr/docs/search
* Fields : https://api.archives-ouvertes.fr/docs/search/?schema=fields#fields


### Calculate number of documents for the specified period

In [3]:
filter_queries = dict(
    publicationDate_tdate='[2017-01-01T00:00:00Z TO 2022-12-31T23:59:59Z]',
#    submittedDate_tdate='[2017-01-01T00:00:00Z TO 2022-12-01T23:59:59Z]',
#    docType_s='ART'
)
response_fields = [
    'label_s',
    'uri_s',
    'keyword_s',
    'audience_s',
    'authEmail_s',
    'authFullName_s',
    'authFirstName_s',
    'authLastName_s',
    'authStructId_i',
    'authIdHal_i',
    'authIdFullName_fs',
    'authIdHasPrimaryStructure_fs',
    'authIdHasStructure_fs',
    'docType_s',
    'keyword_s',
    'jel_s',
    'journalIssn_s',
    'journalTitle_s',
    'anrProjectReference_s',
    'anrProjectTitle_s',
    'anrProjectAcronym_s',
    'europeanProjectReference_s',
    'europeanProjectReference_s',
    'europeanProjectAcronym_s',
    'funding_s',
    'instStructCountry_s',
    'instStructName_s',
    'files_s',
    'language_s',
    'labStructAcronym_s',
    'labStructCode_s',
    'contributorFullName_s',
    'openAccess_bool',
    'peerReviewing_s',
    'peerReviewing_t',
]
params = dict(
    wt='json',
    q=f'labStructId_i:{LAB_STRUCT_ID}',
    fq=[f'{k}:{v}' for k, v in filter_queries.items()],
    fl=','.join(response_fields)
)
# Count the number of rows
r = requests.get('http://api.archives-ouvertes.fr/search', params={**params | dict(rows=0)})
num_found = r.json().get('response', dict()).get('numFound', 0)
print(f'The is {num_found} documents response for query :\n{json.dumps(params, indent=2)}')

The is 1607 documents response for query :
{
  "wt": "json",
  "q": "labStructId_i:15080",
  "fq": [
    "publicationDate_tdate:[2017-01-01T00:00:00Z TO 2022-12-31T23:59:59Z]"
  ],
  "fl": "label_s,uri_s,keyword_s,audience_s,authEmail_s,authFullName_s,authFirstName_s,authLastName_s,authStructId_i,authIdHal_i,authIdFullName_fs,authIdHasPrimaryStructure_fs,authIdHasStructure_fs,docType_s,keyword_s,jel_s,journalIssn_s,journalTitle_s,anrProjectReference_s,anrProjectTitle_s,anrProjectAcronym_s,europeanProjectReference_s,europeanProjectReference_s,europeanProjectAcronym_s,funding_s,instStructCountry_s,instStructName_s,files_s,language_s,labStructAcronym_s,labStructCode_s,contributorFullName_s,openAccess_bool,peerReviewing_s,peerReviewing_t"
}


### Writing all the documents in a result files 

In [4]:
r = requests.get('http://api.archives-ouvertes.fr/search', params={**params | dict(rows=num_found)})
print(f'Read the documents : {r.url}')

with open(f'./result.{EXPORT_DATE}.json', 'w') as f:
    f.write(r.text)

Read the documents : http://api.archives-ouvertes.fr/search?wt=json&q=labStructId_i%3A15080&fq=publicationDate_tdate%3A%5B2017-01-01T00%3A00%3A00Z+TO+2022-12-31T23%3A59%3A59Z%5D&fl=label_s%2Curi_s%2Ckeyword_s%2Caudience_s%2CauthEmail_s%2CauthFullName_s%2CauthFirstName_s%2CauthLastName_s%2CauthStructId_i%2CauthIdHal_i%2CauthIdFullName_fs%2CauthIdHasPrimaryStructure_fs%2CauthIdHasStructure_fs%2CdocType_s%2Ckeyword_s%2Cjel_s%2CjournalIssn_s%2CjournalTitle_s%2CanrProjectReference_s%2CanrProjectTitle_s%2CanrProjectAcronym_s%2CeuropeanProjectReference_s%2CeuropeanProjectReference_s%2CeuropeanProjectAcronym_s%2Cfunding_s%2CinstStructCountry_s%2CinstStructName_s%2Cfiles_s%2Clanguage_s%2ClabStructAcronym_s%2ClabStructCode_s%2CcontributorFullName_s%2CopenAccess_bool%2CpeerReviewing_s%2CpeerReviewing_t&rows=1607


### Dataset preview 

In [5]:
ddf = pd.DataFrame.from_dict(r.json().get('response').get('docs'))
ddf.head()

Unnamed: 0,label_s,openAccess_bool,peerReviewing_s,audience_s,jel_s,keyword_s,journalTitle_s,journalIssn_s,anrProjectTitle_s,anrProjectAcronym_s,...,instStructCountry_s,contributorFullName_s,language_s,uri_s,docType_s,files_s,authStructId_i,funding_s,europeanProjectAcronym_s,europeanProjectReference_s
0,"Nicolas Jacquemet, Stéphane Luchini, Jason F S...",True,1.0,2.0,"[C.C7.C72, D.D8.D83]","[Coordination game, Cheap talk communication, ...",Experimental Economics,1386-4157,"[Opening economics, PROJET AVENIR LYON SAINT-E...","[OSE, Avenir L.S.E., BECOA]",...,"[fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, f...",Nicolas Jacquemet,[en],https://shs.hal.science/halshs-01480525,ART,[https://shs.hal.science/halshs-01480525/file/...,,,,
1,"Lionel Fontagné, Jean Fouré, Alexander Keck. S...",True,1.0,2.0,"[E.E2.E27, F.F0.F02, F.F1.F17, F.F4.F47]","[International trade, Macroeconomic projection...",The World Economy,0378-5920,,,...,"[fr, fr, fr, fr, fr, fr, fr, fr]",Lionel Fontagné,[en],https://hal.science/hal-01416567,ART,,[15080],,,
2,"Ziaul Haque Munim, Hercules Haralambides. Adva...",True,1.0,2.0,,,Maritime Economics and Logistics,1479-2931,,,...,"[no, cn, fr]",Amélie COLLIN,[en],https://hal.science/hal-04046263,ART,,,,,
3,"Armagan Tuna Aktuna-Gunes, Okay Gunes. Measuri...",True,,,"[J.J2.J22, D.D1.D12, D.D1.D13]","[Household production technology, Matching sta...",,,,,...,"[fr, fr, fr, fr, fr, fr]",Lucie Label,[en],https://shs.hal.science/halshs-01491982,OTHER,[https://shs.hal.science/halshs-01491982/file/...,,,,
4,"Marc Fleurbaey, Aurélie Méjean, Antonin Pottie...",True,,,"[D.D6.D63, D.D8.D81]","[Inequality, Fairness, Climate change-related ...",Documents de travail du Centre d'Économie de l...,1955-611X,"[Equité, changement climatique et population, ...","[FAIR-CLIMPOP, PGSE]",...,"[fr, fr, fr, fr, fr, fr, fr, fr, fr, fr, fr]",Lucie Label,[en],https://shs.hal.science/halshs-03048370,OTHER,[https://shs.hal.science/halshs-03048370/file/...,[441569],,,
