In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import unidecode
import json
import re
from typing import Any, Dict, List, Callable
from datetime import datetime

# Load JSON Data File

## Why are we doing this ?

We extract the HAL API results and store them in a json file.

If someone edits an article, its `submittedDate_tdate` in the HAL API results changes, so the same query can return different documents over time.

### Example

Imagine the following use case :

| DATE       | ACTION                                                                  | RESULT                                                                          |
|:----------:|:------------------------------------------------------------------------|:--------------------------------------------------------------------------------|
| 2021-02-14 | An author submits the document as a preprint version                    | The document `sample_uri` is created with `submittedDate_tdate=2021-02-14`      |
| 2021-07-10 | We extract all articles submitted between `2017-01-01` and `2021-07-05` | The document `sample_uri` is in the exported data                               |
| 2021-07-17 | The publisher updates the document to submit the printed version        | The document `sample_uri` is updated. Its `submittedDate_tdate` becomes `submittedDate_tdate=2021-07-17` |
| 2021-07-21 | We extract all articles submitted between `2017-01-01` and `2021-07-05` | The document `sample_uri` is **NO LONGER** in the exported data                 |

### Is there a solution ?

In fact, there is no magic here...

We have to **manually** retrieve all articles submitted between `2021-07-05` and `NOW`, then filter out the documents that should be included in the exported data.

## Constants

Define primary constants

* `LAB_STRUCT_ID` : The identifier in HAL of the CES
* `EXPORT_DATE` : The current date as string

In [2]:
# HAL structure identifier used in the `labStructId_i` search queries.
LAB_STRUCT_ID = 15080
# Timestamp of this run (minute precision), embedded in every exported file name.
EXPORT_DATE = format(datetime.now(), "%Y-%m-%dT%H-%M")

## Datasets

To retrieve the data we will work on, we're using the HAL API. This API is very well documented here :

* Getting started : https://api.archives-ouvertes.fr/docs/search
* Fields : https://api.archives-ouvertes.fr/docs/search/?schema=fields#fields

To retrieve the data we will work on, we're using the HAL API. This API is very well documented here :

* Getting started : https://api.archives-ouvertes.fr/docs/search
* Fields : https://api.archives-ouvertes.fr/docs/search/?schema=fields#fields


### Calculate number of documents for the specified period

In [3]:
# Filter queries (`fq`): restrict the search to `ART` documents published in the
# studied period and submitted up to the 2021-07-05 cut-off.
filter_queries = dict(
    publicationDate_tdate='[2017-01-01T00:00:00Z TO 2021-07-01T00:00:00Z]',
    submittedDate_tdate='[2010-01-01T00:00:00Z TO 2021-07-05T00:00:00Z]',
    docType_s='ART'
)
# Fields returned for each document (`fl`). Each field is listed once: the
# original list repeated `keyword_s` and `europeanProjectReference_s`.
# NOTE(review): the second `europeanProjectReference_s` may have been meant to be
# another European-project field (e.g. the project title) - confirm against the
# HAL field list before adding it.
response_fields = [
    'label_s',
    'uri_s',
    'keyword_s',
    'authEmail_s',
    'authFullName_s',
    'authFirstName_s',
    'authLastName_s',
    'authStructId_i',
    'authId_i',
    'authIdHasPrimaryStructure_fs',
    'authIdHasStructure_fs',
    'jel_s',
    'journalIssn_s',
    'journalTitle_s',
    'anrProjectReference_s',
    'anrProjectTitle_s',
    'anrProjectAcronym_s',
    'europeanProjectReference_s',
    'europeanProjectAcronym_s',
    'funding_s',
    'instStructCountry_s',
    'instStructName_s',
    'files_s',
    'language_s',
    'labStructAcronym_s',
    'labStructCode_s',
    'contributorFullName_s',
    'openAccess_bool',
]
# Query parameters shared by the counting request below and by the later
# request that fetches the documents themselves.
params = dict(
    wt='json',
    q=f'labStructId_i:{LAB_STRUCT_ID}',
    fq=[f'{k}:{v}' for k, v in filter_queries.items()],
    fl=','.join(response_fields)
)
# Count the number of rows: `rows=0` asks for the match count without any document.
# The timeout keeps the cell from hanging forever on an unresponsive server.
r = requests.get('http://api.archives-ouvertes.fr/search', params=params | dict(rows=0), timeout=60)
# Fail loudly on an HTTP error instead of silently reporting 0 documents.
r.raise_for_status()
num_found = r.json().get('response', dict()).get('numFound', 0)
print(f'The is {num_found} documents response for query :\n{json.dumps(params, indent=2)}')

The is 475 documents response for query :
{
  "wt": "json",
  "q": "labStructId_i:15080",
  "fq": [
    "publicationDate_tdate:[2017-01-01T00:00:00Z TO 2021-07-01T00:00:00Z]",
    "submittedDate_tdate:[2010-01-01T00:00:00Z TO 2021-07-05T00:00:00Z]",
    "docType_s:ART"
  ],
  "fl": "label_s,uri_s,keyword_s,authEmail_s,authFullName_s,authFirstName_s,authLastName_s,authStructId_i,authId_i,authIdHasPrimaryStructure_fs,authIdHasStructure_fs,keyword_s,jel_s,journalIssn_s,journalTitle_s,anrProjectReference_s,anrProjectTitle_s,anrProjectAcronym_s,europeanProjectReference_s,europeanProjectReference_s,europeanProjectAcronym_s,funding_s,instStructCountry_s,instStructName_s,files_s,language_s,labStructAcronym_s,labStructCode_s,contributorFullName_s,openAccess_bool"
}


### Writing all the documents to a result file

In [4]:
# Fetch every matching document in one request by asking for exactly
# `num_found` rows (the count obtained from the previous request).
r = requests.get('http://api.archives-ouvertes.fr/search', params=params | dict(rows=num_found), timeout=120)
# Do not store an error page as if it were a valid export.
r.raise_for_status()
print(f'Read the documents : {r.url}')

# Write the raw JSON body to a timestamped file. The encoding is set explicitly
# so that non-ASCII characters (e.g. accented names) are written identically on
# every platform instead of depending on the locale's default encoding.
with open(f'./result.{EXPORT_DATE}.json', 'w', encoding='utf-8') as f:
    f.write(r.text)

Read the documents : http://api.archives-ouvertes.fr/search?wt=json&q=labStructId_i%3A15080&fq=publicationDate_tdate%3A%5B2017-01-01T00%3A00%3A00Z+TO+2021-07-01T00%3A00%3A00Z%5D&fq=submittedDate_tdate%3A%5B2010-01-01T00%3A00%3A00Z+TO+2021-07-05T00%3A00%3A00Z%5D&fq=docType_s%3AART&fl=label_s%2Curi_s%2Ckeyword_s%2CauthEmail_s%2CauthFullName_s%2CauthFirstName_s%2CauthLastName_s%2CauthStructId_i%2CauthId_i%2CauthIdHasPrimaryStructure_fs%2CauthIdHasStructure_fs%2Ckeyword_s%2Cjel_s%2CjournalIssn_s%2CjournalTitle_s%2CanrProjectReference_s%2CanrProjectTitle_s%2CanrProjectAcronym_s%2CeuropeanProjectReference_s%2CeuropeanProjectReference_s%2CeuropeanProjectAcronym_s%2Cfunding_s%2CinstStructCountry_s%2CinstStructName_s%2Cfiles_s%2Clanguage_s%2ClabStructAcronym_s%2ClabStructCode_s%2CcontributorFullName_s%2CopenAccess_bool&rows=475


### Comparison with a previous CSV file

This compares the current export with a previous result file, if there is one, to find what has changed.

The previous data is provided as a CSV file. This CSV file should contain the following columns:
* `uri_s`
* `count`

In [5]:
# Previous export, read from the reference CSV file.
df_previous = pd.read_csv('./uri.base.csv', sep=',')
# Current export: one row per document of the last HAL response held in `r`.
docs_now = r.json().get('response').get('docs')
df_now = pd.DataFrame.from_dict(docs_now)

In [6]:
# URIs listed in the previous export that the current export no longer returns.
# Membership is tested on `uri_s` itself rather than on a null `label_s` after a
# left merge, so a document that is present but has no label is not misreported
# as missing. The order of `df_previous` is preserved.
in_current_export = df_previous['uri_s'].isin(df_now['uri_s'])
# `df_merge` keeps its original final form: a plain list of URI strings.
df_merge = df_previous.loc[~in_current_export, 'uri_s'].tolist()
df_merge

['https://halshs.archives-ouvertes.fr/halshs-02334593',
 'https://hal-amu.archives-ouvertes.fr/hal-02111159',
 'https://halshs.archives-ouvertes.fr/halshs-03201787',
 'https://hal.archives-ouvertes.fr/hal-01577452',
 'https://hal-cnam.archives-ouvertes.fr/hal-02485454']

## Dataset corrections



### Diff until now

Because an editor may replace the submitted text, the document's `submittedDate_tdate` changes and the document is no longer returned by the previous query.

We therefore have to query the documents whose `submittedDate_tdate` lies between the previous cut-off and now in order to retrieve the right documents.

In [7]:
# Same publication period and document type as the first query, but restricted to
# documents whose `submittedDate_tdate` is between the 2021-07-05 cut-off and now.
# NOTE(review): both range bounds use square brackets, so the exact instant
# 2021-07-05T00:00:00Z is also covered by the first query - confirm that a
# document matched by both queries is acceptable.
filter_queries = dict(
    publicationDate_tdate='[2017-01-01T00:00:00Z TO 2021-07-01T00:00:00Z]',
    submittedDate_tdate='[2021-07-05T00:00:00Z TO NOW]',
    docType_s='ART'
)
# We will use the same response fields (`response_fields` is reused unchanged).
# NOTE: `params`, `r` and `num_found` are overwritten here; cells that need the
# first query's values must run before this one.
params = dict(
    wt='json',
    q=f'labStructId_i:{LAB_STRUCT_ID}',
    fq=[f'{k}:{v}' for k, v in filter_queries.items()],
    fl=','.join(response_fields)
)
# Count the number of rows: `rows=0` asks for the match count without any document.
# The timeout keeps the cell from hanging forever on an unresponsive server.
r = requests.get('http://api.archives-ouvertes.fr/search', params=params | dict(rows=0), timeout=60)
# Fail loudly on an HTTP error instead of silently reporting 0 documents.
r.raise_for_status()
num_found = r.json().get('response', dict()).get('numFound', 0)
print(f'The is {num_found} documents response for query :\n{json.dumps(params, indent=2)}')

The is 33 documents response for query :
{
  "wt": "json",
  "q": "labStructId_i:15080",
  "fq": [
    "publicationDate_tdate:[2017-01-01T00:00:00Z TO 2021-07-01T00:00:00Z]",
    "submittedDate_tdate:[2021-07-05T00:00:00Z TO NOW]",
    "docType_s:ART"
  ],
  "fl": "label_s,uri_s,keyword_s,authEmail_s,authFullName_s,authFirstName_s,authLastName_s,authStructId_i,authId_i,authIdHasPrimaryStructure_fs,authIdHasStructure_fs,keyword_s,jel_s,journalIssn_s,journalTitle_s,anrProjectReference_s,anrProjectTitle_s,anrProjectAcronym_s,europeanProjectReference_s,europeanProjectReference_s,europeanProjectAcronym_s,funding_s,instStructCountry_s,instStructName_s,files_s,language_s,labStructAcronym_s,labStructCode_s,contributorFullName_s,openAccess_bool"
}


In [8]:
# Fetch every document of the diff query in one request by asking for exactly
# `num_found` rows (the count obtained from the previous request).
r = requests.get('http://api.archives-ouvertes.fr/search', params=params | dict(rows=num_found), timeout=120)
# Do not store an error page as if it were a valid export.
r.raise_for_status()
print(f'Read the documents : {r.url}')

# Write the raw JSON body to a timestamped file. The encoding is set explicitly
# so that non-ASCII characters (e.g. accented names) are written identically on
# every platform instead of depending on the locale's default encoding.
with open(f'./result-diff.{EXPORT_DATE}.json', 'w', encoding='utf-8') as f:
    f.write(r.text)

Read the documents : http://api.archives-ouvertes.fr/search?wt=json&q=labStructId_i%3A15080&fq=publicationDate_tdate%3A%5B2017-01-01T00%3A00%3A00Z+TO+2021-07-01T00%3A00%3A00Z%5D&fq=submittedDate_tdate%3A%5B2021-07-05T00%3A00%3A00Z+TO+NOW%5D&fq=docType_s%3AART&fl=label_s%2Curi_s%2Ckeyword_s%2CauthEmail_s%2CauthFullName_s%2CauthFirstName_s%2CauthLastName_s%2CauthStructId_i%2CauthId_i%2CauthIdHasPrimaryStructure_fs%2CauthIdHasStructure_fs%2Ckeyword_s%2Cjel_s%2CjournalIssn_s%2CjournalTitle_s%2CanrProjectReference_s%2CanrProjectTitle_s%2CanrProjectAcronym_s%2CeuropeanProjectReference_s%2CeuropeanProjectReference_s%2CeuropeanProjectAcronym_s%2Cfunding_s%2CinstStructCountry_s%2CinstStructName_s%2Cfiles_s%2Clanguage_s%2ClabStructAcronym_s%2ClabStructCode_s%2CcontributorFullName_s%2CopenAccess_bool&rows=33


### Extract the URIs and manually validate them


In [9]:
# Documents of the last HAL response held in `r`, one row per document.
diff_docs = r.json().get('response').get('docs')
ddf = pd.DataFrame.from_dict(diff_docs)
# Keep only the URI column and save it to a timestamped CSV file (no index column).
ddfu = ddf.loc[:, ['uri_s']]
ddfu.to_csv(path_or_buf=f'./diff.result.{EXPORT_DATE}.csv', index=False)
ddfu

Unnamed: 0,uri_s
0,https://hal.archives-ouvertes.fr/hal-03153465
1,https://hal.archives-ouvertes.fr/hal-03329203
2,https://halshs.archives-ouvertes.fr/halshs-032...
3,https://hal.archives-ouvertes.fr/hal-03281809
4,https://halshs.archives-ouvertes.fr/halshs-033...
5,https://hal.archives-ouvertes.fr/hal-03350974
6,https://hal-sciencespo.archives-ouvertes.fr/ha...
7,https://hal.archives-ouvertes.fr/hal-03389540
8,https://hal.archives-ouvertes.fr/hal-03281509
9,https://halshs.archives-ouvertes.fr/halshs-023...
