In [1]:
import pandas as pd
import numpy as np
import requests
import json

# Notebook de contôle

## Contrôles à appliquer avant le workflow Dagster

In [2]:
observation_date = "2022-08-29"
corpus_end_year = 2022

In [18]:
df = pd.read_json(f'01_raw/{observation_date}/exportDonnees_barometre_complet_{observation_date}.json')
df.shape

(43309, 20)

### Vérif fichier bien formé

In [4]:
df[df["dc:identifiers"] == "SCOPUS_ID:84961134992"][["dc:identifiers","@auid","@afids","ce:indexed-name"]]

Unnamed: 0,dc:identifiers,@auid,@afids,ce:indexed-name
10805,SCOPUS_ID:84961134992,56656653900,60110764,Rageot M.
10806,SCOPUS_ID:84961134992,56656653900,60110763,Rageot M.
10807,SCOPUS_ID:84961134992,6603126272,60110764,Regert M.
10808,SCOPUS_ID:84961134992,7003877322,60110763,Filippi J.J.
10809,SCOPUS_ID:84961134992,7004449219,60110763,Fernandez X.


### Vérif affiliations à ajouter

In [19]:
data = df[["dc:identifiers","prism:doi","reference","annee_pub","@afids","mentionAffil_reconstruct","@auid","ce:indexed-name","corresponding_author","Is_dc:creator"]]
data.columns = ['source_id', 'doi',"scopus_title",'year', 'aff_scopus_id','aff_source_text','author_id','author_name','corresponding_author','creator_author']
df_authors = data.groupby('source_id')['author_name'].apply(list).reset_index(name='all_authors')
df_authors['all_authors'] = df_authors["all_authors"].apply('|'.join)
df_reference_data = pd.merge(data,df_authors, left_on='source_id', right_on='source_id')

In [18]:
df_affiliations = pd.read_json('03_primary/referentiel_structures.json')
set_affiliations_id = set(df_affiliations['affiliation_id'].tolist())
set_data_affiliations_id = set(df_reference_data['aff_scopus_id'].unique().tolist())
diff = set_affiliations_id.union(set_data_affiliations_id)  - set_affiliations_id.intersection(set_data_affiliations_id) 
diff

{'0', 'temp_008', 'temp_009'}

### Si diff != 1

In [20]:
#exemple pour retrouver le libellé d'un aff_scopus_id
df[df["@afids"] == "temp_009"].mentionAffil_reconstruct

18258    MajuLab, CNRS, Université de Nice, NUS-NTU Int...
19714    MajuLab., CNRS-Université Nice Sophia Antipoli...
23787    MajuLab, CNRS, Université de Nice, NUS-NTU Int...
23788    MajuLab, CNRS, Université de Nice, NUS-NTU Int...
23885    MajuLab, CNRS-UNS-NUS-NTU, Université Côte d'A...
                               ...                        
42604    MajuLab, International Joint Research Unit IRL...
42951    MajuLab, International Joint Research Unit IRL...
42952    MajuLab, International Joint Research Unit IRL...
42991    MajuLab, International Joint Research Unit UMI...
43302    CRHEA-CNRS, Rue Bernard Gregory| MajuLab, Inte...
Name: mentionAffil_reconstruct, Length: 104, dtype: object

- Ajouter à la main les nouvelles structures 03_primary/referentiel_structures.json
- Compléter avec les ids trouvés (Hal, Idref...)
- Incrémenter le champ id
- Vérifier le parent_id
- Mettre le nb documents_count à 0

### Vérifs éditeurs à ajouter

In [27]:
def keep_duplicate (row):
    return row["author_name"] +"|" + row["affiliation_name"]
crfprefix_base_url = "https://api.crossref.org/v1/prefixes/"
def crf_publisher_metadata(prefix):
    """Get the homogeneous publisher's name from a prefix doi"""
    if prefix is None:
        raise ValueError('prefix cannot be None')
    result = {}
    result["prefix"] = prefix
    try:
        requests.get(crfprefix_base_url+str(prefix))
        if requests.get(crfprefix_base_url+str(prefix)).status_code == 200:
            response = requests.get(crfprefix_base_url+str(prefix)).text
            result["publisher_by_doiprefix"] = json.loads(response).get("message")["name"]
        else:
            pass
    except:
        pass
    return result

def crf_publisher_retrieval(doiprefix_list):
    """Request function crf_publisher_metadata from a list of doi prefixs and compile in a dataframe"""
    df_result = pd.DataFrame(crf_publisher_metadata(i) for i in doiprefix_list)
    return df_result[df_result["prefix"].notna()]

In [21]:
df_affiliations = pd.read_json('03_primary/referentiel_structures.json')

In [22]:
df_affiliations["affiliation_id"] = df_affiliations["affiliation_id"].astype('str')
df_reference_data["aff_scopus_id"] = df_reference_data["aff_scopus_id"].astype('str')
publis_all_with_affiliations_data = pd.merge(df_reference_data,df_affiliations[df_affiliations["affiliation_id"].notna()], left_on='aff_scopus_id', right_on='affiliation_id',how="left").drop(columns=['affiliation_id','documents_count','ppn_valide','affcourt_valide','RNSR','VIAF','ISNI','BNF','HAL'])
publis_all_with_affiliations_data = publis_all_with_affiliations_data.rename(columns={'id': 'aff_internal_id', 'parent_id': 'aff_parent_id'})
# identify corresponding author if UCA
publis_all_with_affiliations_data["corresponding"] = publis_all_with_affiliations_data[publis_all_with_affiliations_data["corresponding_author"] == "oui"].apply (lambda row: keep_duplicate(row), axis=1)

In [23]:
publis_all_with_affiliations_data["corresponding_author"] = publis_all_with_affiliations_data["corresponding_author"].astype('category')
publis_all_with_affiliations_data["corresponding_author"] = publis_all_with_affiliations_data["corresponding_author"].cat.set_categories(['oui', 'non', 'corr absent pour cette publi'], ordered=True)
publis_all_with_affiliations_data.sort_values(by=['doi', 'corresponding_author'])
publis_uniques_doi_data = publis_all_with_affiliations_data[publis_all_with_affiliations_data.doi.notna()].drop_duplicates(subset=['doi'], keep='first')[["source_id","doi","year","corresponding","all_authors"]]
publis_uniques_doi_data = publis_uniques_doi_data[publis_uniques_doi_data.year < int(corpus_end_year)]

In [24]:
new_prefix_list = list(set([item.partition("/")[0] for item in publis_uniques_doi_data["doi"].to_list()]))
old_prefix_df = pd.read_csv('03_primary/mapping_doiprefixes_publisher.csv', sep=",",encoding='utf8')
old_prefix_list = old_prefix_df["prefix"].astype(str).to_list()
diff_prefix_list = list(set(new_prefix_list) - set(old_prefix_list))
diff_prefix_list

['10.5220',
 '10.7150',
 '10.22564',
 '10.17396',
 '10.1210',
 '10.1200',
 '10.21630',
 '10.3390',
 '10.1130',
 '10.5210',
 '10.25537',
 '10.25428',
 '10.9876',
 '10.14601',
 '10.24820',
 '10.30549',
 '10.22201',
 '10.5441',
 '10.3750',
 '10.1070',
 '10.2383',
 '10.2110',
 '10.5840',
 '10.6092',
 '10.1590',
 '10.5277',
 '10.14658',
 '10.1080',
 '10.21409',
 '10.13128',
 '10.2340',
 '10.2423',
 '10.1190',
 '10.3150',
 '10.1439',
 '10.24310',
 '10.1680',
 '10.19272',
 '10.11936',
 '10.5278',
 '10.20960',
 '10.35470',
 '10.14649',
 '10.57',
 '10.1160',
 '10.11909',
 '10.2312',
 '10.26331',
 '10.4310',
 '10.4418',
 '10.1482',
 '10.26028',
 '10.31009',
 '10.7410',
 '10.48611',
 '10.34190',
 '10.1530',
 '10.1140',
 '10.13130',
 '10.4230',
 '10.3240',
 '10.32473',
 '10.3290',
 '10.21411',
 '10.17660',
 '10.4454',
 '10.1090',
 '10.24840',
 '10.3850',
 '10.4000',
 '10.2140']

In [28]:
df_new_prefix_result = crf_publisher_retrieval(diff_prefix_list)
publishers_doi_prefix = old_prefix_df.append(df_new_prefix_result)

In [29]:
publishers_doi_prefix

Unnamed: 0,prefix,publisher_by_doiprefix
0,10.21873,Anticancer Research USA Inc.
1,10.1119,American Association of Physics Teachers (AAPT)
2,10.2166,IWA Publishing
3,10.1177,SAGE Publications
4,10.4081,PAGEPress Publications
...,...,...
66,10.1090,American Mathematical Society (AMS)
67,10.24840,University of Porto
68,10.3850,Research Publishing Services
69,10.4000,


In [30]:
publishers_doi_prefix.drop_duplicates(subset=['prefix'], keep='last').to_csv('03_primary/mapping_doiprefixes_publisher.csv', index = False,encoding='utf8')

## Divers